Diffstat (limited to 'llvm')
-rw-r--r--  llvm/docs/CommandGuide/llvm-objcopy.rst | 8
-rw-r--r--  llvm/docs/LangRef.rst | 2
-rw-r--r--  llvm/docs/ReleaseNotes.rst | 4
-rw-r--r--  llvm/include/llvm/ADT/APInt.h | 4
-rw-r--r--  llvm/include/llvm/ADT/iterator_range.h | 10
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfo.h | 5
-rw-r--r--  llvm/include/llvm/BinaryFormat/ELF.h | 26
-rw-r--r--  llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def | 22
-rw-r--r--  llvm/include/llvm/IR/AutoUpgrade.h | 3
-rw-r--r--  llvm/include/llvm/IR/DebugProgramInstruction.h | 19
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.td | 3
-rw-r--r--  llvm/include/llvm/IR/PassManager.h | 24
-rw-r--r--  llvm/include/llvm/IR/PrintPasses.h | 19
-rw-r--r--  llvm/include/llvm/ObjCopy/CommonConfig.h | 3
-rw-r--r--  llvm/include/llvm/Object/WindowsMachineFlag.h | 20
-rw-r--r--  llvm/include/llvm/ProfileData/MemProf.h | 67
-rw-r--r--  llvm/include/llvm/ProfileData/SampleProf.h | 4
-rw-r--r--  llvm/include/llvm/TextAPI/InterfaceFile.h | 4
-rw-r--r--  llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h | 6
-rw-r--r--  llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h | 2
-rw-r--r--  llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h | 26
-rw-r--r--  llvm/lib/Analysis/LazyValueInfo.cpp | 13
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp | 11
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 110
-rw-r--r--  llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 17
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 49
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 9
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MIRPrinter.cpp | 20
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 16
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 29
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 48
-rw-r--r--  llvm/lib/CodeGen/ShrinkWrap.cpp | 11
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp | 39
-rw-r--r--  llvm/lib/IR/LegacyPassManager.cpp | 7
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 27
-rw-r--r--  llvm/lib/Linker/IRMover.cpp | 21
-rw-r--r--  llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 34
-rw-r--r--  llvm/lib/Object/COFFObjectFile.cpp | 42
-rw-r--r--  llvm/lib/Object/WindowsResource.cpp | 13
-rw-r--r--  llvm/lib/ProfileData/InstrProfReader.cpp | 2
-rw-r--r--  llvm/lib/ProfileData/InstrProfWriter.cpp | 217
-rw-r--r--  llvm/lib/ProfileData/MemProf.cpp | 170
-rw-r--r--  llvm/lib/Support/APInt.cpp | 58
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 53
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 12
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 29
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 13
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 33
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/DSDIRInstructions.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 45
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 78
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 50
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 1
-rw-r--r--  llvm/lib/Target/Mips/Mips32r6InstrInfo.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 184
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h | 2
-rw-r--r--  llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp | 61
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 30
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrGISel.td | 25
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 35
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 16
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 19
-rw-r--r--  llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 44
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td | 5
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp | 32
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td | 116
-rw-r--r--  llvm/lib/Target/X86/X86SchedBroadwell.td | 12
-rw-r--r--  llvm/lib/Target/X86/X86SchedHaswell.td | 15
-rw-r--r--  llvm/lib/Target/X86/X86SchedSapphireRapids.td | 14
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeClient.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86SchedSkylakeServer.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver3.td | 86
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 4
-rw-r--r--  llvm/lib/TextAPI/InterfaceFile.cpp | 8
-rw-r--r--  llvm/lib/TextAPI/TextStubV5.cpp | 2
-rw-r--r--  llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp | 8
-rw-r--r--  llvm/lib/Transforms/IPO/SampleProfileProbe.cpp | 22
-rw-r--r--  llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 9
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 106
-rw-r--r--  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 49
-rw-r--r--  llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp | 28
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 205
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 383
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 33
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 74
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 56
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 185
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanValue.h | 1
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 43
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 71
-rw-r--r--  llvm/lib/WindowsDriver/MSVCPaths.cpp | 4
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/stepvector.ll | 143
-rw-r--r--  llvm/test/Analysis/CostModel/X86/cast.ll | 8
-rw-r--r--  llvm/test/Analysis/CostModel/X86/extend.ll | 8
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll | 2413
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll | 16
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll | 16
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll | 2413
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll | 2413
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir | 12
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 112
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll | 5
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll | 20
-rw-r--r--  llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll | 11
-rw-r--r--  llvm/test/CodeGen/AArch64/cheap-as-a-move.ll | 30
-rw-r--r--  llvm/test/CodeGen/AArch64/extract-bits.ll | 5
-rw-r--r--  llvm/test/CodeGen/AArch64/hadd-combine.ll | 54
-rw-r--r--  llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll | 5
-rw-r--r--  llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll | 50
-rw-r--r--  llvm/test/CodeGen/AArch64/sadd_sat.ll | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 299
-rw-r--r--  llvm/test/CodeGen/AArch64/sink-and-fold.ll | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/sms-regpress.mir | 160
-rw-r--r--  llvm/test/CodeGen/AArch64/ssub_sat.ll | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 299
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll | 75
-rw-r--r--  llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 295
-rw-r--r--  llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 291
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 86
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 73
-rw-r--r--  llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/function-args.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/function-returns.ll | 87
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll | 582
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll | 4
-rw-r--r--  llvm/test/CodeGen/Generic/allow-check.ll | 1
-rw-r--r--  llvm/test/CodeGen/PowerPC/legalize-vaarg.ll | 17
-rw-r--r--  llvm/test/CodeGen/PowerPC/sms-regpress.mir | 186
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir | 902
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir | 534
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir | 900
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir | 900
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir | 1589
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir | 810
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir | 1589
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir | 694
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir | 817
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir | 116
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir | 88
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir | 1589
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir | 820
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir | 675
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir | 820
-rw-r--r--  llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir | 820
-rw-r--r--  llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll | 21
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll | 183
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll | 60
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll | 209
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 53
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 256
-rw-r--r--  llvm/test/CodeGen/SPARC/inlineasm-bad.ll | 9
-rw-r--r--  llvm/test/CodeGen/SPARC/inlineasm.ll | 9
-rw-r--r--  llvm/test/CodeGen/SPIRV/OpVariable_order.ll | 14
-rw-r--r--  llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll | 26
-rw-r--r--  llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll | 60
-rw-r--r--  llvm/test/CodeGen/WebAssembly/multi-return.ll | 72
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-arith.ll | 13138
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd.ll | 408
-rw-r--r--  llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll | 11
-rw-r--r--  llvm/test/CodeGen/X86/AppendingLinkage.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/combine-pavg.ll | 9
-rw-r--r--  llvm/test/CodeGen/X86/evex-to-vex-compress.mir | 32
-rw-r--r--  llvm/test/CodeGen/X86/freeze-vector.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll | 220
-rw-r--r--  llvm/test/CodeGen/X86/load-local-v3i129.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/pr23664.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-nowrap.ll | 2213
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll | 18
-rw-r--r--  llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll | 6
-rw-r--r--  llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll | 36
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1150_asm_features.s | 21
-rw-r--r--  llvm/test/MC/AMDGPU/gfx11_asm_err.s | 8
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_features.s | 39
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 440
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 442
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s | 210
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s | 241
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s | 864
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s | 864
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s | 324
-rw-r--r--  llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s | 324
-rw-r--r--  llvm/test/MC/AMDGPU/vop_dpp.s | 4
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt | 9
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt | 4
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt | 216
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt | 213
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt | 106
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt | 109
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt | 223
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt | 232
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 168
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 174
-rw-r--r--  llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt | 8
-rw-r--r--  llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt | 8
-rw-r--r--  llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt | 8
-rw-r--r--  llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt | 8
-rw-r--r--  llvm/test/MC/Mips/mips32r6/valid.s | 8
-rw-r--r--  llvm/test/MC/Mips/mips64r6/valid.s | 8
-rw-r--r--  llvm/test/MachineVerifier/test_g_fcmp.mir | 13
-rw-r--r--  llvm/test/MachineVerifier/test_g_icmp.mir | 13
-rw-r--r--  llvm/test/TableGen/x86-fold-tables.inc | 28
-rw-r--r--  llvm/test/Transforms/CorrelatedValuePropagation/range.ll | 86
-rw-r--r--  llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll | 77
-rw-r--r--  llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll | 77
-rw-r--r--  llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll | 163
-rw-r--r--  llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll | 163
-rw-r--r--  llvm/test/Transforms/InstCombine/implies.ll | 424
-rw-r--r--  llvm/test/Transforms/InstCombine/known-bits.ll | 106
-rw-r--r--  llvm/test/Transforms/InstCombine/select.ll | 794
-rw-r--r--  llvm/test/Transforms/InstCombine/zext-or-icmp.ll | 10
-rw-r--r--  llvm/test/Transforms/InstSimplify/implies.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll | 51
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll | 117
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll | 68
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll | 116
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll | 175
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll | 124
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll | 132
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll | 119
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll | 142
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 134
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll | 191
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll | 89
-rw-r--r--  llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll | 222
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll | 101
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll | 37
-rw-r--r--  llvm/test/Transforms/PGOProfile/vtable_profile.ll | 5
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll | 6
-rw-r--r--  llvm/test/Transforms/RemoveTraps/remove-traps.ll | 210
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll | 23
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll | 34
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll | 60
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll | 2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll | 51
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll | 10
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll | 51
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll | 6
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll | 4
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll | 2
-rw-r--r--  llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof | 23
-rw-r--r--  llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof | 8
-rw-r--r--  llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll | 229
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll | 9
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll | 12
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll | 6
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll | 12
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll | 15
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll | 22
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll | 11
-rw-r--r--  llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll | 16
-rw-r--r--  llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll | 228
-rw-r--r--  llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll | 19
-rw-r--r--  llvm/test/Verifier/pr69428.ll | 48
-rw-r--r--  llvm/test/tools/dsymutil/ARM/firmware.test | 11
-rw-r--r--  llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o | bin 0 -> 528 bytes
-rwxr-xr-x  llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out | bin 0 -> 16560 bytes
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s | 34
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s | 22
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s | 34
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s | 22
-rw-r--r--  llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s | 10
-rw-r--r--  llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s | 6
-rw-r--r--  llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s | 18
-rw-r--r--  llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s | 10
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s | 72
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s | 24
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s | 24
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s | 24
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s | 24
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s | 48
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s | 48
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s | 36
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s | 48
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s | 72
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s | 72
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s | 48
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s | 72
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s | 12
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s | 216
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s | 240
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s | 48
-rw-r--r--  llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s | 216
-rw-r--r--  llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s | 38
-rw-r--r--  llvm/test/tools/llvm-objcopy/ELF/compress-sections.s | 128
-rw-r--r--  llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test | 29
-rw-r--r--  llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s | 7
-rw-r--r--  llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s | 298
-rw-r--r--  llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s | 2
-rw-r--r--  llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 4
-rw-r--r--  llvm/tools/gold/CMakeLists.txt | 2
-rw-r--r--  llvm/tools/llvm-dis/llvm-dis.cpp | 6
-rw-r--r--  llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 36
-rw-r--r--  llvm/tools/llvm-objcopy/ObjcopyOpts.td | 6
-rw-r--r--  llvm/tools/llvm-readobj/ELFDumper.cpp | 127
-rw-r--r--  llvm/unittests/ADT/APIntTest.cpp | 18
-rw-r--r--  llvm/unittests/ProfileData/MemProfTest.cpp | 41
-rw-r--r--  llvm/unittests/TextAPI/TextStubV5Tests.cpp | 4
-rw-r--r--  llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp | 9
-rw-r--r--  llvm/utils/TableGen/X86ManualCompressEVEXTables.def | 16
-rw-r--r--  llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn | 1
-rw-r--r--  llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn | 1
-rw-r--r--  llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3
-rw-r--r--  llvm/utils/gn/secondary/libcxx/src/BUILD.gn | 1
-rw-r--r--  llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn | 1
387 files changed, 45504 insertions, 12320 deletions
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 985d16e..57d6280 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -309,6 +309,14 @@ them.
Compress DWARF debug sections in the output, using the specified format.
Supported formats are ``zlib`` and ``zstd``. Use ``zlib`` if ``<format>`` is omitted.
+.. option:: --compress-sections <section>=<format>
+
+ Compress or decompress sections matched by ``<section>`` using the specified
+ format. Supported formats are ``zlib`` and ``zstd``. Specify ``none`` for
+ decompression. When a section is matched by multiple options, the last one
+ wins. A wildcard ``<section>`` starting with '!' is disallowed.
+ Sections within a segment cannot be (de)compressed.
+
.. option:: --decompress-debug-sections
Decompress any compressed DWARF debug sections in the output.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1d4ff52..774729c 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5557,6 +5557,8 @@ RISC-V:
Sparc:
+- ``L``: Print the low-order register of a two-register operand.
+- ``H``: Print the high-order register of a two-register operand.
- ``r``: No effect.
SystemZ:
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 7588048..ff7fed9 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -182,6 +182,10 @@ Changes to the LLVM tools
for ELF input to skip the specified symbols when executing other options
that can change a symbol's name, binding or visibility.
+* llvm-objcopy now supports ``--compress-sections`` to compress or decompress
+ arbitrary sections not within a segment.
+ (`#85036 <https://github.com/llvm/llvm-project/pull/85036>`_.)
+
* llvm-profgen now supports COFF+DWARF binaries. This enables Sample-based PGO
on Windows using Intel VTune's SEP. For details on usage, see the `end-user
documentation for SPGO
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index b9b39f3..8d3c029 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -1740,8 +1740,8 @@ public:
return *this;
}
- /// \returns the multiplicative inverse for a given modulo.
- APInt multiplicativeInverse(const APInt &modulo) const;
+ /// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth.
+ APInt multiplicativeInverse() const;
/// @}
/// \name Building-block Operations for APInt and APFloat
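[Editor's note] A minimal sketch of the new calling convention (not part of the patch; `demo` is an illustrative name). The old overload took an explicit modulus; the new one fixes the modulus to 2^BitWidth, which is only meaningful when the value is odd:

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    void demo() {
      llvm::APInt X(32, 15); // odd, so an inverse modulo 2^32 exists
      llvm::APInt Inv = X.multiplicativeInverse();
      assert((X * Inv).isOne() && "X * Inv == 1 (mod 2^32)");
    }
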
diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h
index 7d288ea..6c66def 100644
--- a/llvm/include/llvm/ADT/iterator_range.h
+++ b/llvm/include/llvm/ADT/iterator_range.h
@@ -48,9 +48,10 @@ public:
// See https://github.com/llvm/llvm-project/issues/63843
template <typename Container>
#else
- template <typename Container,
- std::enable_if_t<explicitly_convertible<
- detail::IterOfRange<Container>, IteratorT>::value> * = nullptr>
+ template <
+ typename Container,
+ std::enable_if_t<explicitly_convertible<
+ llvm::detail::IterOfRange<Container>, IteratorT>::value> * = nullptr>
#endif
iterator_range(Container &&c)
: begin_iterator(adl_begin(c)), end_iterator(adl_end(c)) {
@@ -65,7 +66,8 @@ public:
};
template <typename Container>
-iterator_range(Container &&) -> iterator_range<detail::IterOfRange<Container>>;
+iterator_range(Container &&)
+ -> iterator_range<llvm::detail::IterOfRange<Container>>;
/// Convenience function for iterating over sub-ranges.
///
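[Editor's note] The change only qualifies `detail::` as `llvm::detail::` so the deduction guide resolves correctly regardless of where it is instantiated; usage is unchanged. A usage sketch (illustrative only):

    #include "llvm/ADT/iterator_range.h"
    #include <vector>

    void demo(std::vector<int> &V) {
      // CTAD via the deduction guide: the iterator type is deduced from V.
      llvm::iterator_range R(V); // iterator_range<std::vector<int>::iterator>
      for (int I : R)
        (void)I;
    }
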
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index bad0a77..fa9392b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -190,7 +190,10 @@ enum class TailFoldingStyle {
/// Use predicate to control both data and control flow, but modify
/// the trip count so that a runtime overflow check can be avoided
/// and such that the scalar epilogue loop can always be removed.
- DataAndControlFlowWithoutRuntimeCheck
+ DataAndControlFlowWithoutRuntimeCheck,
+ /// Use predicated EVL instructions for tail-folding.
+ /// Indicates that VP intrinsics should be used.
+ DataWithEVL,
};
struct TailFoldingInfo {
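[Editor's note] A hypothetical helper (not in the patch) showing how a caller might test for the new style:

    #include "llvm/Analysis/TargetTransformInfo.h"

    static bool usesExplicitVectorLength(llvm::TailFoldingStyle S) {
      // Only DataWithEVL folds the tail using VP intrinsics that carry an
      // explicit vector length (EVL) operand.
      return S == llvm::TailFoldingStyle::DataWithEVL;
    }
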
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 877f3f7..ed267c1 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1712,11 +1712,6 @@ enum {
NT_ANDROID_TYPE_MEMTAG = 4,
};
-// ARM note types.
-enum {
- NT_ARM_TYPE_PAUTH_ABI_TAG = 1,
-};
-
// Memory tagging values used in NT_ANDROID_TYPE_MEMTAG notes.
enum {
// Enumeration to determine the tagging mode. In Android-land, 'SYNC' means
@@ -1740,6 +1735,7 @@ enum : unsigned {
GNU_PROPERTY_STACK_SIZE = 1,
GNU_PROPERTY_NO_COPY_ON_PROTECTED = 2,
GNU_PROPERTY_AARCH64_FEATURE_1_AND = 0xc0000000,
+ GNU_PROPERTY_AARCH64_FEATURE_PAUTH = 0xc0000001,
GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002,
GNU_PROPERTY_X86_UINT32_OR_LO = 0xc0008000,
@@ -1758,6 +1754,26 @@ enum : unsigned {
GNU_PROPERTY_AARCH64_FEATURE_1_GCS = 1 << 2,
};
+// aarch64 PAuth platforms.
+enum : unsigned {
+ AARCH64_PAUTH_PLATFORM_INVALID = 0x0,
+ AARCH64_PAUTH_PLATFORM_BAREMETAL = 0x1,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX = 0x10000002,
+};
+
+// Bit positions of version flags for AARCH64_PAUTH_PLATFORM_LLVM_LINUX.
+enum : unsigned {
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS = 0,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS = 1,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS = 2,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS = 3,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR = 4,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR = 5,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI = 6,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST =
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI,
+};
+
// x86 processor feature bits.
enum : unsigned {
GNU_PROPERTY_X86_FEATURE_1_IBT = 1 << 0,
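[Editor's note] A sketch of how the new PAuth constants compose (illustrative; the particular choice of feature bits is an assumption, not mandated by the patch):

    #include "llvm/BinaryFormat/ELF.h"
    #include <cstdint>

    // Build the version word for the llvm_linux PAuth platform by OR-ing
    // the bit positions defined above.
    uint64_t makeLLVMLinuxPAuthVersion() {
      using namespace llvm::ELF;
      uint64_t Version = 0;
      Version |= 1ULL << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS;
      Version |= 1ULL << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS;
      Version |= 1ULL << AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS;
      return Version; // paired with AARCH64_PAUTH_PLATFORM_LLVM_LINUX
    }
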
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
index 5fb3fa4..cb05db8 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
@@ -1,18 +1,19 @@
-
#ifndef ELF_RELOC
#error "ELF_RELOC must be defined"
#endif
-// Based on ABI release 1.1-beta, dated 6 November 2013. NB: The cover page of
-// this document, IHI0056C_beta_aaelf64.pdf, on infocenter.arm.com, still
-// labels this as release 1.0.
+// Based on released ABI: https://github.com/ARM-software/abi-aa, aaelf64.
+// ELF64
+// Null relocation: also 0x100 for ELF64
ELF_RELOC(R_AARCH64_NONE, 0)
+// Data relocations
ELF_RELOC(R_AARCH64_ABS64, 0x101)
ELF_RELOC(R_AARCH64_ABS32, 0x102)
ELF_RELOC(R_AARCH64_ABS16, 0x103)
ELF_RELOC(R_AARCH64_PREL64, 0x104)
ELF_RELOC(R_AARCH64_PREL32, 0x105)
ELF_RELOC(R_AARCH64_PREL16, 0x106)
+// Static AArch64 relocations
ELF_RELOC(R_AARCH64_MOVW_UABS_G0, 0x107)
ELF_RELOC(R_AARCH64_MOVW_UABS_G0_NC, 0x108)
ELF_RELOC(R_AARCH64_MOVW_UABS_G1, 0x109)
@@ -60,11 +61,13 @@ ELF_RELOC(R_AARCH64_LD64_GOT_LO12_NC, 0x138)
ELF_RELOC(R_AARCH64_LD64_GOTPAGE_LO15, 0x139)
ELF_RELOC(R_AARCH64_PLT32, 0x13a)
ELF_RELOC(R_AARCH64_GOTPCREL32, 0x13b)
+// General dynamic TLS relocations
ELF_RELOC(R_AARCH64_TLSGD_ADR_PREL21, 0x200)
ELF_RELOC(R_AARCH64_TLSGD_ADR_PAGE21, 0x201)
ELF_RELOC(R_AARCH64_TLSGD_ADD_LO12_NC, 0x202)
ELF_RELOC(R_AARCH64_TLSGD_MOVW_G1, 0x203)
ELF_RELOC(R_AARCH64_TLSGD_MOVW_G0_NC, 0x204)
+// Local dynamic TLS relocations
ELF_RELOC(R_AARCH64_TLSLD_ADR_PREL21, 0x205)
ELF_RELOC(R_AARCH64_TLSLD_ADR_PAGE21, 0x206)
ELF_RELOC(R_AARCH64_TLSLD_ADD_LO12_NC, 0x207)
@@ -92,6 +95,7 @@ ELF_RELOC(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC, 0x21c)
ELF_RELOC(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21, 0x21d)
ELF_RELOC(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, 0x21e)
ELF_RELOC(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19, 0x21f)
+// Local exec TLS relocations
ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G2, 0x220)
ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1, 0x221)
ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC, 0x222)
@@ -108,6 +112,7 @@ ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12, 0x22c)
ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC, 0x22d)
ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12, 0x22e)
ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC, 0x22f)
+// TLS descriptor relocations
ELF_RELOC(R_AARCH64_TLSDESC_LD_PREL19, 0x230)
ELF_RELOC(R_AARCH64_TLSDESC_ADR_PREL21, 0x231)
ELF_RELOC(R_AARCH64_TLSDESC_ADR_PAGE21, 0x232)
@@ -122,8 +127,7 @@ ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12, 0x23a)
ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC, 0x23b)
ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12, 0x23c)
ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC, 0x23d)
-ELF_RELOC(R_AARCH64_AUTH_ABS64, 0x244)
-// Dynamic relocations start
+// Dynamic relocations
ELF_RELOC(R_AARCH64_COPY, 0x400)
ELF_RELOC(R_AARCH64_GLOB_DAT, 0x401)
ELF_RELOC(R_AARCH64_JUMP_SLOT, 0x402)
@@ -136,8 +140,12 @@ ELF_RELOC(R_AARCH64_TLS_DTPREL64, 0x405)
ELF_RELOC(R_AARCH64_TLS_TPREL64, 0x406)
ELF_RELOC(R_AARCH64_TLSDESC, 0x407)
ELF_RELOC(R_AARCH64_IRELATIVE, 0x408)
+// PAuthABI static and dynamic relocations: defined in pauthabielf64,
+// https://github.com/ARM-software/abi-aa
+ELF_RELOC(R_AARCH64_AUTH_ABS64, 0x244)
ELF_RELOC(R_AARCH64_AUTH_RELATIVE, 0x411)
+// ELF32
// ELF_RELOC(R_AARCH64_P32_NONE, 0)
ELF_RELOC(R_AARCH64_P32_ABS32, 0x001)
ELF_RELOC(R_AARCH64_P32_ABS16, 0x002)
@@ -216,7 +224,7 @@ ELF_RELOC(R_AARCH64_P32_TLSDESC_ADR_PAGE21, 0x07c)
ELF_RELOC(R_AARCH64_P32_TLSDESC_LD32_LO12, 0x07d)
ELF_RELOC(R_AARCH64_P32_TLSDESC_ADD_LO12, 0x07e)
ELF_RELOC(R_AARCH64_P32_TLSDESC_CALL, 0x07f)
-// Dynamic relocations start
+// Dynamic relocations
ELF_RELOC(R_AARCH64_P32_COPY, 0x0b4)
ELF_RELOC(R_AARCH64_P32_GLOB_DAT, 0x0b5)
ELF_RELOC(R_AARCH64_P32_JUMP_SLOT, 0x0b6)
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 152f781..97c3e4d 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -36,7 +36,8 @@ namespace llvm {
/// for upgrading, and returns true if it requires upgrading. It may return
/// null in NewFn if all calls to the original intrinsic function
/// should be transformed to non-function-call instructions.
- bool UpgradeIntrinsicFunction(Function *F, Function *&NewFn);
+ bool UpgradeIntrinsicFunction(Function *F, Function *&NewFn,
+ bool CanUpgradeDebugIntrinsicsToRecords = true);
/// This is the complement to the above, replacing a specific call to an
/// intrinsic function with a call to the specified new function.
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index c947713..9f49874 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -659,6 +659,25 @@ getDbgRecordRange(DbgMarker *DebugMarker) {
DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef)
+/// Used to temporarily set the debug info format of a function, module, or
+/// basic block for the duration of this object's lifetime, after which the
+/// prior state will be restored.
+template <typename T> class ScopedDbgInfoFormatSetter {
+ T &Obj;
+ bool OldState;
+
+public:
+ ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
+ : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) {
+ Obj.setIsNewDbgInfoFormat(NewState);
+ }
+ ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); }
+};
+
+template <typename T>
+ScopedDbgInfoFormatSetter(T &Obj,
+ bool NewState) -> ScopedDbgInfoFormatSetter<T>;
+
} // namespace llvm
#endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H
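[Editor's note] The class (moved here from PrintPasses.h) is a standard RAII guard; a usage sketch, assuming a Function in scope:

    #include "llvm/IR/DebugProgramInstruction.h"
    #include "llvm/IR/Function.h"

    void withNewFormat(llvm::Function &F) {
      // Switch F to the new (DbgRecord) debug-info format for this scope.
      llvm::ScopedDbgInfoFormatSetter Setter(F, /*NewState=*/true);
      // ... work on F in the new format ...
    } // destructor restores F's previous format
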
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c04f4c5..f0723a6 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1733,8 +1733,7 @@ def int_ubsantrap : Intrinsic<[], [llvm_i8_ty],
// Return true if ubsan check is allowed.
def int_allow_ubsan_check : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i8_ty],
- [IntrInaccessibleMemOnly, IntrWriteMem, ImmArg<ArgIndex<0>>, NoUndef<RetIndex>]>,
- ClangBuiltin<"__builtin_allow_ubsan_check">;
+ [IntrInaccessibleMemOnly, IntrWriteMem, ImmArg<ArgIndex<0>>, NoUndef<RetIndex>]>;
// Return true if runtime check is allowed.
def int_allow_runtime_check : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_metadata_ty],
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index 1084654..d701481 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -64,23 +64,6 @@ extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
namespace llvm {
-// RemoveDIs: Provide facilities for converting debug-info from one form to
-// another, which are no-ops for everything but modules.
-template <class IRUnitT> inline bool shouldConvertDbgInfo(IRUnitT &IR) {
- return false;
-}
-template <> inline bool shouldConvertDbgInfo(Module &IR) {
- return !IR.IsNewDbgInfoFormat && UseNewDbgInfoFormat;
-}
-template <class IRUnitT> inline void doConvertDbgInfoToNew(IRUnitT &IR) {}
-template <> inline void doConvertDbgInfoToNew(Module &IR) {
- IR.convertToNewDbgValues();
-}
-template <class IRUnitT> inline void doConvertDebugInfoToOld(IRUnitT &IR) {}
-template <> inline void doConvertDebugInfoToOld(Module &IR) {
- IR.convertFromNewDbgValues();
-}
-
// Forward declare the analysis manager template.
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
@@ -229,9 +212,7 @@ public:
// RemoveDIs: if requested, convert debug-info to DbgRecord representation
// for duration of these passes.
- bool ShouldConvertDbgInfo = shouldConvertDbgInfo(IR);
- if (ShouldConvertDbgInfo)
- doConvertDbgInfoToNew(IR);
+ ScopedDbgInfoFormatSetter FormatSetter(IR, UseNewDbgInfoFormat);
for (auto &Pass : Passes) {
// Check the PassInstrumentation's BeforePass callbacks before running the
@@ -255,9 +236,6 @@ public:
PA.intersect(std::move(PassPA));
}
- if (ShouldConvertDbgInfo)
- doConvertDebugInfoToOld(IR);
-
// Invalidation was handled after each pass in the above loop for the
// current unit of IR. Therefore, the remaining analysis results in the
// AnalysisManager are preserved. We mark this with a set so that we don't
diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h
index 3803bd0..95b97e7 100644
--- a/llvm/include/llvm/IR/PrintPasses.h
+++ b/llvm/include/llvm/IR/PrintPasses.h
@@ -78,25 +78,6 @@ std::string doSystemDiff(StringRef Before, StringRef After,
StringRef OldLineFormat, StringRef NewLineFormat,
StringRef UnchangedLineFormat);
-/// Used to temporarily set the debug info format of a function, module, or
-/// basic block for the duration of this object's lifetime, after which the
-/// prior state will be restored.
-template <typename T> class ScopedDbgInfoFormatSetter {
- T &Obj;
- bool OldState;
-
-public:
- ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
- : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) {
- Obj.setIsNewDbgInfoFormat(NewState);
- }
- ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); }
-};
-
-template <typename T>
-ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
- -> ScopedDbgInfoFormatSetter<T>;
-
} // namespace llvm
#endif // LLVM_IR_PRINTPASSES_H
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 9d6d5fb..ae08d40 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -262,6 +262,9 @@ struct CommonConfig {
bool DecompressDebugSections = false;
DebugCompressionType CompressionType = DebugCompressionType::None;
+
+ SmallVector<std::pair<NameMatcher, llvm::DebugCompressionType>, 0>
+ compressSections;
};
} // namespace objcopy
diff --git a/llvm/include/llvm/Object/WindowsMachineFlag.h b/llvm/include/llvm/Object/WindowsMachineFlag.h
index 05b8f0d..1cb408e 100644
--- a/llvm/include/llvm/Object/WindowsMachineFlag.h
+++ b/llvm/include/llvm/Object/WindowsMachineFlag.h
@@ -13,6 +13,9 @@
#ifndef LLVM_OBJECT_WINDOWSMACHINEFLAG_H
#define LLVM_OBJECT_WINDOWSMACHINEFLAG_H
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/TargetParser/Triple.h"
+
namespace llvm {
class StringRef;
@@ -28,6 +31,23 @@ StringRef machineToStr(COFF::MachineTypes MT);
// Only returns ARMNT, ARM64, AMD64, I386, or IMAGE_FILE_MACHINE_UNKNOWN.
COFF::MachineTypes getMachineType(StringRef S);
+template <typename T> Triple::ArchType getMachineArchType(T machine) {
+ switch (machine) {
+ case COFF::IMAGE_FILE_MACHINE_I386:
+ return llvm::Triple::ArchType::x86;
+ case COFF::IMAGE_FILE_MACHINE_AMD64:
+ return llvm::Triple::ArchType::x86_64;
+ case COFF::IMAGE_FILE_MACHINE_ARMNT:
+ return llvm::Triple::ArchType::thumb;
+ case COFF::IMAGE_FILE_MACHINE_ARM64:
+ case COFF::IMAGE_FILE_MACHINE_ARM64EC:
+ case COFF::IMAGE_FILE_MACHINE_ARM64X:
+ return llvm::Triple::ArchType::aarch64;
+ default:
+ return llvm::Triple::ArchType::UnknownArch;
+ }
}
+} // namespace llvm
+
#endif
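[Editor's note] Usage sketch (illustrative): the template accepts either the `COFF::MachineTypes` enum or a raw integer machine value read from a header.

    #include "llvm/Object/WindowsMachineFlag.h"

    llvm::Triple::ArchType archOfArm64Image() {
      // ARM64, ARM64EC and ARM64X all map to aarch64.
      return llvm::getMachineArchType(llvm::COFF::IMAGE_FILE_MACHINE_ARM64);
    }
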
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index ff00900..0431c18 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -22,6 +22,8 @@ enum IndexedVersion : uint64_t {
Version0 = 0,
// Version 1: Added a version field to the header.
Version1 = 1,
+ // Version 2: Added a call stack table. Under development.
+ Version2 = 2,
};
constexpr uint64_t MinimumSupportedVersion = Version0;
@@ -289,23 +291,14 @@ struct IndexedAllocationInfo {
: CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB) {}
// Returns the size in bytes when this allocation info struct is serialized.
- size_t serializedSize() const {
- return sizeof(uint64_t) + // The number of frames to serialize.
- sizeof(FrameId) * CallStack.size() + // The callstack frame ids.
- PortableMemInfoBlock::serializedSize(); // The size of the payload.
- }
+ size_t serializedSize(IndexedVersion Version) const;
bool operator==(const IndexedAllocationInfo &Other) const {
if (Other.Info != Info)
return false;
- if (Other.CallStack.size() != CallStack.size())
+ if (Other.CSId != CSId)
return false;
-
- for (size_t J = 0; J < Other.CallStack.size(); J++) {
- if (Other.CallStack[J] != CallStack[J])
- return false;
- }
return true;
}
@@ -357,6 +350,9 @@ struct IndexedMemProfRecord {
// inline location list may include additional entries, users should pick
// the last entry in the list with the same function GUID.
llvm::SmallVector<llvm::SmallVector<FrameId>> CallSites;
+ // Conceptually the same as above. We are going to keep both CallSites and
+ // CallSiteIds while we are transitioning from CallSites to CallSiteIds.
+ llvm::SmallVector<CallStackId> CallSiteIds;
void clear() {
AllocSites.clear();
@@ -370,47 +366,31 @@ struct IndexedMemProfRecord {
CallSites.append(Other.CallSites);
}
- size_t serializedSize() const {
- size_t Result = sizeof(GlobalValue::GUID);
- for (const IndexedAllocationInfo &N : AllocSites)
- Result += N.serializedSize();
-
- // The number of callsites we have information for.
- Result += sizeof(uint64_t);
- for (const auto &Frames : CallSites) {
- // The number of frame ids to serialize.
- Result += sizeof(uint64_t);
- Result += Frames.size() * sizeof(FrameId);
- }
- return Result;
- }
+ size_t serializedSize(IndexedVersion Version) const;
bool operator==(const IndexedMemProfRecord &Other) const {
if (Other.AllocSites.size() != AllocSites.size())
return false;
- if (Other.CallSites.size() != CallSites.size())
- return false;
-
for (size_t I = 0; I < AllocSites.size(); I++) {
if (AllocSites[I] != Other.AllocSites[I])
return false;
}
- for (size_t I = 0; I < CallSites.size(); I++) {
- if (CallSites[I] != Other.CallSites[I])
- return false;
- }
+ if (Other.CallSiteIds != CallSiteIds)
+ return false;
return true;
}
// Serializes the memprof records in \p Records to the ostream \p OS based
// on the schema provided in \p Schema.
- void serialize(const MemProfSchema &Schema, raw_ostream &OS);
+ void serialize(const MemProfSchema &Schema, raw_ostream &OS,
+ IndexedVersion Version);
// Deserializes memprof records from the Buffer.
static IndexedMemProfRecord deserialize(const MemProfSchema &Schema,
- const unsigned char *Buffer);
+ const unsigned char *Buffer,
+ IndexedVersion Version);
// Returns the GUID for the function name after canonicalization. For
// memprof, we remove any .llvm suffix added by LTO. MemProfRecords are
@@ -480,7 +460,8 @@ public:
using offset_type = uint64_t;
RecordLookupTrait() = delete;
- RecordLookupTrait(const MemProfSchema &S) : Schema(S) {}
+ RecordLookupTrait(IndexedVersion V, const MemProfSchema &S)
+ : Version(V), Schema(S) {}
static bool EqualKey(uint64_t A, uint64_t B) { return A == B; }
static uint64_t GetInternalKey(uint64_t K) { return K; }
@@ -507,11 +488,13 @@ public:
data_type ReadData(uint64_t K, const unsigned char *D,
offset_type /*Unused*/) {
- Record = IndexedMemProfRecord::deserialize(Schema, D);
+ Record = IndexedMemProfRecord::deserialize(Schema, D, Version);
return Record;
}
private:
+ // Holds the MemProf version.
+ IndexedVersion Version;
// Holds the memprof schema used to deserialize records.
MemProfSchema Schema;
// Holds the records from one function deserialized from the indexed format.
@@ -534,19 +517,23 @@ public:
// we must use a default constructor with no params for the writer trait so we
// have a public member which must be initialized by the user.
MemProfSchema *Schema = nullptr;
+ // The MemProf version to use for the serialization.
+ IndexedVersion Version;
- RecordWriterTrait() = default;
+ // We do not support the default constructor, which does not set Version.
+ RecordWriterTrait() = delete;
+ RecordWriterTrait(IndexedVersion V) : Version(V) {}
static hash_value_type ComputeHash(key_type_ref K) { return K; }
- static std::pair<offset_type, offset_type>
+ std::pair<offset_type, offset_type>
EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
using namespace support;
endian::Writer LE(Out, llvm::endianness::little);
offset_type N = sizeof(K);
LE.write<offset_type>(N);
- offset_type M = V.serializedSize();
+ offset_type M = V.serializedSize(Version);
LE.write<offset_type>(M);
return std::make_pair(N, M);
}
@@ -560,7 +547,7 @@ public:
void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V,
offset_type /*Unused*/) {
assert(Schema != nullptr && "MemProf schema is not initialized!");
- V.serialize(*Schema, Out);
+ V.serialize(*Schema, Out, Version);
// Clear the IndexedMemProfRecord which results in clearing/freeing its
// vectors of allocs and callsites. This is owned by the associated on-disk
// hash table, but unused after this point. See also the comment added to
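[Editor's note] Both traits now carry the format version explicitly, so the reader and writer sides agree on how records are (de)serialized. A construction sketch (illustrative; schema setup elided):

    #include "llvm/ProfileData/MemProf.h"

    void demo(const llvm::memprof::MemProfSchema &Schema) {
      using namespace llvm::memprof;
      RecordLookupTrait Reader(Version2, Schema); // deserialization side
      RecordWriterTrait Writer(Version2);         // serialization side
      (void)Reader;
      (void)Writer;
    }
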
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index 8ac84d4..51d590b 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -466,7 +466,7 @@ struct SampleContextFrame {
LineLocation Location;
SampleContextFrame() : Location(0, 0) {}
-
+
SampleContextFrame(FunctionId Func, LineLocation Location)
: Func(Func), Location(Location) {}
@@ -527,7 +527,7 @@ public:
: Func(Name), State(UnknownContext), Attributes(ContextNone) {
assert(!Name.empty() && "Name is empty");
}
-
+
SampleContext(FunctionId Func)
: Func(Func), State(UnknownContext), Attributes(ContextNone) {}
diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h
index 10a37e3..23c27cb 100644
--- a/llvm/include/llvm/TextAPI/InterfaceFile.h
+++ b/llvm/include/llvm/TextAPI/InterfaceFile.h
@@ -299,9 +299,9 @@ public:
}
/// Set the runpath search paths.
- /// \param InputTarget The target applicable to runpath search path.
/// \param RPath The name of runpath.
- void addRPath(const Target &InputTarget, StringRef RPath);
+ /// \param InputTarget The target applicable to runpath search path.
+ void addRPath(StringRef RPath, const Target &InputTarget);
/// Get the list of runpath search paths.
///
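[Editor's note] Call sites swap their arguments accordingly; a before/after sketch (the rpath string and target are illustrative):

    #include "llvm/TextAPI/InterfaceFile.h"

    void addSwiftRPath(llvm::MachO::InterfaceFile &File,
                       const llvm::MachO::Target &T) {
      // Old order: File.addRPath(T, "/usr/lib/swift");
      File.addRPath("/usr/lib/swift", T);
    }
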
diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index 03aa93c..7f2cc0e 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -85,14 +85,12 @@ private:
void findInvokeNormalDests(DenseSet<BasicBlock *> &InvokeNormalDests);
void computeBlocksToIgnore(DenseSet<BasicBlock *> &BlocksToIgnore,
DenseSet<BasicBlock *> &BlocksAndCallsToIgnore);
- void computeProbeIdForCallsites(
- const DenseSet<BasicBlock *> &BlocksAndCallsToIgnore);
const Instruction *
getOriginalTerminator(const BasicBlock *Head,
const DenseSet<BasicBlock *> &BlocksToIgnore);
void computeCFGHash(const DenseSet<BasicBlock *> &BlocksToIgnore);
- void computeProbeIdForBlocks(const DenseSet<BasicBlock *> &BlocksToIgnore);
- void computeProbeIdForCallsites();
+ void computeProbeId(const DenseSet<BasicBlock *> &BlocksToIgnore,
+ const DenseSet<BasicBlock *> &BlocksAndCallsToIgnore);
Function *F;
diff --git a/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h b/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h
index 58f6bbc..bae1584 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h
@@ -25,6 +25,8 @@ namespace llvm {
class RemoveTrapsPass : public PassInfoMixin<RemoveTrapsPass> {
public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+ static bool IsRequested();
};
} // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
index d898ee5..581d354 100644
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
@@ -129,16 +129,28 @@ public:
bool profileIsValid(const Function &F, const FunctionSamples &Samples) const {
const auto *Desc = getDesc(F);
- assert((LTOPhase != ThinOrFullLTOPhase::ThinLTOPostLink || !Desc ||
+ bool IsAvailableExternallyLinkage =
+ GlobalValue::isAvailableExternallyLinkage(F.getLinkage());
+ // Always check the function attribute to determine checksum mismatch for
+ // `available_externally` functions even if their desc is available. This
+ // is because the desc is computed based on the original internal function
+ // and it's substituted by the `available_externally` function during link
+ // time. However, when an unstable IR or ODR violation issue occurs, the
+ // definitions of the same function across different translation units could
+ // be different and result in different checksums. So we should use the
+ // state from the new (available_externally) function, which is saved in its
+ // attribute.
+ assert((LTOPhase != ThinOrFullLTOPhase::ThinLTOPostLink ||
+ IsAvailableExternallyLinkage || !Desc ||
profileIsHashMismatched(*Desc, Samples) ==
F.hasFnAttribute("profile-checksum-mismatch")) &&
- "In post-link, profile checksum matching state doesn't match "
- "function 'profile-checksum-mismatch' attribute.");
+ "In post-link, profile checksum matching state doesn't match the "
+ "internal function's 'profile-checksum-mismatch' attribute.");
(void)LTOPhase;
- // The desc for import function is unavailable. Check the function attribute
- // for mismatch.
- return (!Desc && !F.hasFnAttribute("profile-checksum-mismatch")) ||
- (Desc && !profileIsHashMismatched(*Desc, Samples));
+ if (IsAvailableExternallyLinkage || !Desc)
+ return !F.hasFnAttribute("profile-checksum-mismatch");
+
+ return Desc && !profileIsHashMismatched(*Desc, Samples);
}
};
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index b8bc811..6cded82 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -588,10 +588,14 @@ LazyValueInfoImpl::getBlockValue(Value *Val, BasicBlock *BB,
static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) {
switch (BBI->getOpcode()) {
- default: break;
- case Instruction::Load:
+ default:
+ break;
case Instruction::Call:
case Instruction::Invoke:
+ if (std::optional<ConstantRange> Range = cast<CallBase>(BBI)->getRange())
+ return ValueLatticeElement::getRange(*Range);
+ [[fallthrough]];
+ case Instruction::Load:
if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range))
if (isa<IntegerType>(BBI->getType())) {
return ValueLatticeElement::getRange(
@@ -706,10 +710,11 @@ std::optional<ValueLatticeElement>
LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
ValueLatticeElement Result; // Start Undefined.
- // If this is the entry block, we must be asking about an argument. The
- // value is overdefined.
+ // If this is the entry block, we must be asking about an argument.
if (BB->isEntryBlock()) {
assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
+ if (std::optional<ConstantRange> Range = cast<Argument>(Val)->getRange())
+ return ValueLatticeElement::getRange(*Range);
return ValueLatticeElement::getOverdefined();
}
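[Editor's note] A sketch of the new source of range facts (the helper name is illustrative): a `range` attribute on a call's return value, or on an argument, now feeds LVI directly, with `!range` metadata remaining the fallback for loads.

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"
    #include <optional>

    llvm::ConstantRange rangeOrFull(const llvm::CallBase &CB,
                                    unsigned BitWidth) {
      if (std::optional<llvm::ConstantRange> R = CB.getRange())
        return *R;                                   // from the range attribute
      return llvm::ConstantRange::getFull(BitWidth); // otherwise unconstrained
    }
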
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 515b9d0..e030b9f 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -944,10 +944,7 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
// Calculate the multiplicative inverse of K! / 2^T;
// this multiplication factor will perform the exact division by
// K! / 2^T.
- APInt Mod = APInt::getSignedMinValue(W+1);
- APInt MultiplyFactor = OddFactorial.zext(W+1);
- MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod);
- MultiplyFactor = MultiplyFactor.trunc(W);
+ APInt MultiplyFactor = OddFactorial.multiplicativeInverse();
// Calculate the product, at width T+W
IntegerType *CalculationTy = IntegerType::get(SE.getContext(),
@@ -10086,10 +10083,8 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
// If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent
// (N / D) in general. The inverse itself always fits into BW bits, though,
// so we immediately truncate it.
- APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D
- APInt Mod(BW + 1, 0);
- Mod.setBit(BW - Mult2); // Mod = N / D
- APInt I = AD.multiplicativeInverse(Mod).trunc(BW);
+ APInt AD = A.lshr(Mult2).trunc(BW - Mult2); // AD = A / D
+ APInt I = AD.multiplicativeInverse().zext(BW);
// 4. Compute the minimum unsigned root of the equation:
// I * (B / D) mod (N / D)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index b5e8a1d..5ad4da4 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -648,6 +648,7 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred,
auto m_V =
m_CombineOr(m_Specific(V), m_PtrToIntSameSize(Q.DL, m_Specific(V)));
+ Value *Y;
const APInt *Mask, *C;
uint64_t ShAmt;
switch (Pred) {
@@ -656,16 +657,18 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred,
if (match(LHS, m_V) && match(RHS, m_APInt(C))) {
Known = Known.unionWith(KnownBits::makeConstant(*C));
// assume(V & Mask = C)
- } else if (match(LHS, m_And(m_V, m_APInt(Mask))) &&
+ } else if (match(LHS, m_c_And(m_V, m_Value(Y))) &&
match(RHS, m_APInt(C))) {
// For one bits in Mask, we can propagate bits from C to V.
- Known.Zero |= ~*C & *Mask;
- Known.One |= *C & *Mask;
+ Known.One |= *C;
+ if (match(Y, m_APInt(Mask)))
+ Known.Zero |= ~*C & *Mask;
// assume(V | Mask = C)
- } else if (match(LHS, m_Or(m_V, m_APInt(Mask))) && match(RHS, m_APInt(C))) {
+ } else if (match(LHS, m_c_Or(m_V, m_Value(Y))) && match(RHS, m_APInt(C))) {
// For zero bits in Mask, we can propagate bits from C to V.
- Known.Zero |= ~*C & ~*Mask;
- Known.One |= *C & ~*Mask;
+ Known.Zero |= ~*C;
+ if (match(Y, m_APInt(Mask)))
+ Known.One |= *C & ~*Mask;
// assume(V ^ Mask = C)
} else if (match(LHS, m_Xor(m_V, m_APInt(Mask))) &&
match(RHS, m_APInt(C))) {
@@ -8390,8 +8393,7 @@ bool llvm::matchSimpleRecurrence(const BinaryOperator *I, PHINode *&P,
/// Return true if "icmp Pred LHS RHS" is always true.
static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
- const Value *RHS, const DataLayout &DL,
- unsigned Depth) {
+ const Value *RHS) {
if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS)
return true;
@@ -8403,8 +8405,26 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
const APInt *C;
// LHS s<= LHS +_{nsw} C if C >= 0
- if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))))
+ // LHS s<= LHS | C if C >= 0
+ if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))) ||
+ match(RHS, m_Or(m_Specific(LHS), m_APInt(C))))
return !C->isNegative();
+
+ // LHS s<= smax(LHS, V) for any V
+ if (match(RHS, m_c_SMax(m_Specific(LHS), m_Value())))
+ return true;
+
+ // smin(RHS, V) s<= RHS for any V
+ if (match(LHS, m_c_SMin(m_Specific(RHS), m_Value())))
+ return true;
+
+ // Match A to (X +_{nsw} CA) and B to (X +_{nsw} CB)
+ const Value *X;
+ const APInt *CLHS, *CRHS;
+ if (match(LHS, m_NSWAddLike(m_Value(X), m_APInt(CLHS))) &&
+ match(RHS, m_NSWAddLike(m_Specific(X), m_APInt(CRHS))))
+ return CLHS->sle(*CRHS);
+
return false;
}
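// Editorial note: spelled out, the new signed facts above are
//   LHS s<= smax(LHS, V)      and      smin(RHS, V) s<= RHS
// for any V, plus the inlined  (X +nsw CA) s<= (X +nsw CB) iff CA s<= CB,
// which replaces the DL/Depth-based machinery the old signature carried.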
@@ -8414,34 +8434,36 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
cast<OverflowingBinaryOperator>(RHS)->hasNoUnsignedWrap())
return true;
+ // LHS u<= LHS | V for any V
+ if (match(RHS, m_c_Or(m_Specific(LHS), m_Value())))
+ return true;
+
+ // LHS u<= umax(LHS, V) for any V
+ if (match(RHS, m_c_UMax(m_Specific(LHS), m_Value())))
+ return true;
+
// RHS >> V u<= RHS for any V
if (match(LHS, m_LShr(m_Specific(RHS), m_Value())))
return true;
- // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
- auto MatchNUWAddsToSameValue = [&](const Value *A, const Value *B,
- const Value *&X,
- const APInt *&CA, const APInt *&CB) {
- if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) &&
- match(B, m_NUWAdd(m_Specific(X), m_APInt(CB))))
- return true;
+ // RHS u/ C_ugt_1 u<= RHS
+ const APInt *C;
+ if (match(LHS, m_UDiv(m_Specific(RHS), m_APInt(C))) && C->ugt(1))
+ return true;
- // If X & C == 0 then (X | C) == X +_{nuw} C
- if (match(A, m_Or(m_Value(X), m_APInt(CA))) &&
- match(B, m_Or(m_Specific(X), m_APInt(CB)))) {
- KnownBits Known(CA->getBitWidth());
- computeKnownBits(X, Known, DL, Depth + 1, /*AC*/ nullptr,
- /*CxtI*/ nullptr, /*DT*/ nullptr);
- if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero))
- return true;
- }
+ // RHS & V u<= RHS for any V
+ if (match(LHS, m_c_And(m_Specific(RHS), m_Value())))
+ return true;
- return false;
- };
+ // umin(RHS, V) u<= RHS for any V
+ if (match(LHS, m_c_UMin(m_Specific(RHS), m_Value())))
+ return true;
+ // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
const Value *X;
const APInt *CLHS, *CRHS;
- if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS))
+ if (match(LHS, m_NUWAddLike(m_Value(X), m_APInt(CLHS))) &&
+ match(RHS, m_NUWAddLike(m_Specific(X), m_APInt(CRHS))))
return CLHS->ule(*CRHS);
return false;
@@ -8453,37 +8475,36 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
/// ALHS ARHS" is true. Otherwise, return std::nullopt.
static std::optional<bool>
isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
- const Value *ARHS, const Value *BLHS, const Value *BRHS,
- const DataLayout &DL, unsigned Depth) {
+ const Value *ARHS, const Value *BLHS, const Value *BRHS) {
switch (Pred) {
default:
return std::nullopt;
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
- if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS) &&
+ isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS))
return true;
return std::nullopt;
case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SGE:
- if (isTruePredicate(CmpInst::ICMP_SLE, ALHS, BLHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_SLE, BRHS, ARHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_SLE, ALHS, BLHS) &&
+ isTruePredicate(CmpInst::ICMP_SLE, BRHS, ARHS))
return true;
return std::nullopt;
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
- if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS) &&
+ isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS))
return true;
return std::nullopt;
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:
- if (isTruePredicate(CmpInst::ICMP_ULE, ALHS, BLHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_ULE, BRHS, ARHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_ULE, ALHS, BLHS) &&
+ isTruePredicate(CmpInst::ICMP_ULE, BRHS, ARHS))
return true;
return std::nullopt;
}
@@ -8527,7 +8548,7 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
CmpInst::Predicate RPred,
const Value *R0, const Value *R1,
const DataLayout &DL,
- bool LHSIsTrue, unsigned Depth) {
+ bool LHSIsTrue) {
Value *L0 = LHS->getOperand(0);
Value *L1 = LHS->getOperand(1);
@@ -8574,7 +8595,7 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
return LPred == RPred;
if (LPred == RPred)
- return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
+ return isImpliedCondOperands(LPred, L0, L1, R0, R1);
return std::nullopt;
}
@@ -8636,8 +8657,7 @@ llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred,
// Both LHS and RHS are icmps.
const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
if (LHSCmp)
- return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue,
- Depth);
+ return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue);
/// The LHS should be an 'or', 'and', or a 'select' instruction. We expect
/// the RHS to be an icmp.
@@ -9276,11 +9296,17 @@ void llvm::findValuesAffectedByCondition(
if (ICmpInst::isEquality(Pred)) {
if (match(B, m_ConstantInt())) {
+ Value *Y;
// (X & C) or (X | C) or (X ^ C).
// (X << C) or (X >>_s C) or (X >>_u C).
if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) ||
match(A, m_Shift(m_Value(X), m_ConstantInt())))
AddAffected(X);
+ else if (match(A, m_And(m_Value(X), m_Value(Y))) ||
+ match(A, m_Or(m_Value(X), m_Value(Y)))) {
+ AddAffected(X);
+ AddAffected(Y);
+ }
}
} else {
// Handle (A + C1) u< C2, which is the canonical form of
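// Editorial note: for the commuted matches above, assume((x & y) == C)
// forces every bit set in C to be one in x (and in y), so Known.One |= C is
// sound even for a non-constant mask; the Known.Zero half still needs the
// constant mask. Dually, assume((x | y) == C) gives Known.Zero |= ~C.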
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index de2396f..4f2486c 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -21,19 +21,14 @@ using namespace llvm;
extern bool WriteNewDbgInfoFormatToBitcode;
PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat &&
+ WriteNewDbgInfoFormatToBitcode);
const ModuleSummaryIndex *Index =
EmitSummaryIndex ? &(AM.getResult<ModuleSummaryIndexAnalysis>(M))
: nullptr;
WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash);
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
-
return PreservedAnalyses::all();
}
@@ -57,16 +52,12 @@ namespace {
StringRef getPassName() const override { return "Bitcode Writer"; }
bool runOnModule(Module &M) override {
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(
+ M, M.IsNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode);
WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr,
/*EmitModuleHash=*/false);
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
return false;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
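// Editorial sketch, not part of the patch: ScopedDbgInfoFormatSetter (used
// here and in MIRPrinter, LegacyPassManager and IRMover below) is an RAII
// guard; a minimal model of the assumed behavior:
class ScopedFormatGuard { // hypothetical stand-in, illustration only
  bool &Flag;
  bool Saved;
public:
  ScopedFormatGuard(bool &F, bool Desired) : Flag(F), Saved(F) {
    Flag = Desired; // convert to the requested debug-info format on entry
  }
  ~ScopedFormatGuard() { Flag = Saved; } // restore on every exit path
};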
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index a155387..293bb5a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2927,7 +2927,7 @@ bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
return true;
}
- report_fatal_error("unknown special variable");
+ report_fatal_error("unknown special variable with appending linkage");
}
/// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 4ec966e..6213530 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -568,8 +568,29 @@ static void expandIToFP(Instruction *IToFP) {
IToFP->eraseFromParent();
}
+static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
+ VectorType *VTy = cast<FixedVectorType>(I->getType());
+
+ IRBuilder<> Builder(I);
+
+ unsigned NumElements = VTy->getElementCount().getFixedValue();
+ Value *Result = PoisonValue::get(VTy);
+ for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+ Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx);
+ Value *Cast = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
+ I->getType()->getScalarType());
+ Result = Builder.CreateInsertElement(Result, Cast, Idx);
+ if (isa<Instruction>(Cast))
+ Replace.push_back(cast<Instruction>(Cast));
+ }
+ I->replaceAllUsesWith(Result);
+ I->dropAllReferences();
+ I->eraseFromParent();
+}
+
static bool runImpl(Function &F, const TargetLowering &TLI) {
SmallVector<Instruction *, 4> Replace;
+ SmallVector<Instruction *, 4> ReplaceVector;
bool Modified = false;
unsigned MaxLegalFpConvertBitWidth =
@@ -584,29 +605,36 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
switch (I.getOpcode()) {
case Instruction::FPToUI:
case Instruction::FPToSI: {
- // TODO: This pass doesn't handle vectors.
- if (I.getOperand(0)->getType()->isVectorTy())
+ // TODO: This pass doesn't handle scalable vectors.
+ if (I.getOperand(0)->getType()->isScalableTy())
continue;
- auto *IntTy = dyn_cast<IntegerType>(I.getType());
+ auto *IntTy = dyn_cast<IntegerType>(I.getType()->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- Replace.push_back(&I);
+ if (I.getOperand(0)->getType()->isVectorTy())
+ ReplaceVector.push_back(&I);
+ else
+ Replace.push_back(&I);
Modified = true;
break;
}
case Instruction::UIToFP:
case Instruction::SIToFP: {
- // TODO: This pass doesn't handle vectors.
- if (I.getOperand(0)->getType()->isVectorTy())
+ // TODO: This pass doesn't handle scalable vectors.
+ if (I.getOperand(0)->getType()->isScalableTy())
continue;
- auto *IntTy = dyn_cast<IntegerType>(I.getOperand(0)->getType());
+ auto *IntTy =
+ dyn_cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- Replace.push_back(&I);
+ if (I.getOperand(0)->getType()->isVectorTy())
+ ReplaceVector.push_back(&I);
+ else
+ Replace.push_back(&I);
Modified = true;
break;
}
@@ -615,6 +643,11 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
}
}
+ while (!ReplaceVector.empty()) {
+ Instruction *I = ReplaceVector.pop_back_val();
+ scalarize(I, Replace);
+ }
+
if (Replace.empty())
return false;
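// Editorial sketch: for a fixed vector whose scalar conversion is too wide,
// scalarize() above emits (IR shown as comments, illustrative only):
//   %e0 = extractelement <2 x fp128> %v, i64 0
//   %c0 = fptoui fp128 %e0 to i129
//   %r0 = insertelement <2 x i129> poison, i129 %c0, i64 0
//   %e1 = extractelement <2 x fp128> %v, i64 1
//   %c1 = fptoui fp128 %e1 to i129
//   %r1 = insertelement <2 x i129> %r0, i129 %c1, i64 1
// The scalar casts land in Replace and are then expanded like scalar code.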
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 5cf7a33..e53e35d 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5201,10 +5201,7 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) {
// Calculate the multiplicative inverse modulo BW.
// 2^W requires W + 1 bits, so we have to extend and then truncate.
- unsigned W = Divisor.getBitWidth();
- APInt Factor = Divisor.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
+ APInt Factor = Divisor.multiplicativeInverse();
Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 797bbf7..95c6a35 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3006,6 +3006,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_SPLAT_VECTOR: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
}
}
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index b8ba782..6b35caf 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -1278,7 +1278,7 @@ MachineIRBuilder::buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
return DstTy.isScalar();
else
return DstTy.isVector() &&
- DstTy.getNumElements() == Op0Ty.getNumElements();
+ DstTy.getElementCount() == Op0Ty.getElementCount();
}() && "Type Mismatch");
break;
}
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bbc6d39..bf3aee6 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -69,6 +69,8 @@ static cl::opt<bool> SimplifyMIR(
static cl::opt<bool> PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true),
cl::desc("Print MIR debug-locations"));
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
namespace {
/// This structure describes how to print out stack object references.
@@ -986,29 +988,19 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
}
void llvm::printMIR(raw_ostream &OS, const Module &M) {
- // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
- // in dbg.value format.
- bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- const_cast<Module &>(M).convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(const_cast<Module &>(M),
+ WriteNewDbgInfoFormat);
yaml::Output Out(OS);
Out << const_cast<Module &>(M);
-
- if (IsNewDbgInfoFormat)
- const_cast<Module &>(M).convertToNewDbgValues();
}
void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
// RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
// in dbg.value format.
- bool IsNewDbgInfoFormat = MF.getFunction().IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- const_cast<Function &>(MF.getFunction()).convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(
+ const_cast<Function &>(MF.getFunction()), WriteNewDbgInfoFormat);
MIRPrinter Printer(OS);
Printer.print(MF);
-
- if (IsNewDbgInfoFormat)
- const_cast<Function &>(MF.getFunction()).convertToNewDbgValues();
}
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index eb42a78..b9c6765 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1268,7 +1268,7 @@ private:
// Calculate the upper limit of each pressure set
void computePressureSetLimit(const RegisterClassInfo &RCI) {
for (unsigned PSet = 0; PSet < PSetNum; PSet++)
- PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
+ PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);
// We assume fixed registers, such as stack pointer, are already in use.
// Therefore subtracting the weight of the fixed registers from the limit of
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2f46b23..f20080c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1164,19 +1164,20 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue N01 = N0.getOperand(1);
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+ SDNodeFlags NewFlags;
+ if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
+ Flags.hasNoUnsignedWrap())
+ NewFlags.setNoUnsignedWrap(true);
+
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
- return DAG.getNode(Opc, DL, VT, N00, OpNode);
+ return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags);
return SDValue();
}
if (TLI.isReassocProfitable(DAG, N0, N1)) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
- SDNodeFlags NewFlags;
- if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
- Flags.hasNoUnsignedWrap())
- NewFlags.setNoUnsignedWrap(true);
SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags);
return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags);
}
@@ -3053,17 +3054,15 @@ static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
/// Helper for doing combines based on N0 and N1 being added to each other.
SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
- SDNode *LocReference) {
+ SDNode *LocReference) {
EVT VT = N0.getValueType();
SDLoc DL(LocReference);
// fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
- if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
- isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
+ SDValue Y, N;
+ if (sd_match(N1, m_Shl(m_Neg(m_Value(Y)), m_Value(N))))
return DAG.getNode(ISD::SUB, DL, VT, N0,
- DAG.getNode(ISD::SHL, DL, VT,
- N1.getOperand(0).getOperand(1),
- N1.getOperand(1)));
+ DAG.getNode(ISD::SHL, DL, VT, Y, N));
if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
return V;
@@ -12056,6 +12055,13 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
}
SDValue DAGCombiner::visitVP_SELECT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+
+ if (SDValue V = DAG.simplifySelect(N0, N1, N2))
+ return V;
+
if (SDValue V = foldBoolSelectToLogic<VPMatchContext>(N, DAG))
return V;
@@ -22260,12 +22266,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
return DAG.getUNDEF(ScalarVT);
- // extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x)), idx
- if (VecOp.hasOneUse() && VecOp.getOpcode() == ISD::FREEZE) {
- return DAG.getFreeze(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
- VecOp.getOperand(0), Index));
- }
-
// extract_vector_elt (build_vector x, y), 1 -> y
if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
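// Editorial note: hoisting NewFlags above the constant-folding path lets
// reassociation keep no-unsigned-wrap there as well, e.g.
//   (add nuw (add nuw x, C1), C2) -> (add nuw x, C1 + C2)
// previously nuw was preserved only on the one-use reassociation path.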
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 3332c02..a8b1f41 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -53,6 +53,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
SDValue R = SDValue();
switch (N->getOpcode()) {
+ // clang-format off
default:
#ifndef NDEBUG
dbgs() << "SoftenFloatResult #" << ResNo << ": ";
@@ -115,9 +116,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FPOWI:
case ISD::FLDEXP:
case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break;
- case ISD::FFREXP:
- R = SoftenFloatRes_FFREXP(N);
- break;
+ case ISD::FFREXP: R = SoftenFloatRes_FFREXP(N); break;
case ISD::STRICT_FREM:
case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
case ISD::STRICT_FRINT:
@@ -150,14 +149,11 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMAXIMUM:
- case ISD::VECREDUCE_FMINIMUM:
- R = SoftenFloatRes_VECREDUCE(N);
- break;
+ case ISD::VECREDUCE_FMINIMUM: R = SoftenFloatRes_VECREDUCE(N); break;
case ISD::VECREDUCE_SEQ_FADD:
- case ISD::VECREDUCE_SEQ_FMUL:
- R = SoftenFloatRes_VECREDUCE_SEQ(N);
- break;
- }
+ case ISD::VECREDUCE_SEQ_FMUL: R = SoftenFloatRes_VECREDUCE_SEQ(N); break;
+ // clang-format on
+ }
// If R is null, the sub-method took care of registering the result.
if (R.getNode()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8c543ae..1dd0fa4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5149,6 +5149,17 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::OR:
return ConsiderFlags && Op->getFlags().hasDisjoint();
+ case ISD::SCALAR_TO_VECTOR:
+ // Check if we demand any upper (undef) elements.
+ return !PoisonOnly && DemandedElts.ugt(1);
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Ensure that the element index is in bounds.
+ EVT VecVT = Op.getOperand(0).getValueType();
+ KnownBits KnownIdx = computeKnownBits(Op.getOperand(1), Depth + 1);
+ return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
+ }
+
case ISD::INSERT_VECTOR_ELT:{
// Ensure that the element index is in bounds.
EVT VecVT = Op.getOperand(0).getValueType();
@@ -11545,30 +11556,32 @@ bool llvm::isNeutralConstant(unsigned Opcode, SDNodeFlags Flags, SDValue V,
unsigned OperandNo) {
// NOTE: The cases should match with IR's ConstantExpr::getBinOpIdentity().
// TODO: Target-specific opcodes could be added.
- if (auto *Const = isConstOrConstSplat(V)) {
+ if (auto *ConstV = isConstOrConstSplat(V, /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true)) {
+ APInt Const = ConstV->getAPIntValue().trunc(V.getScalarValueSizeInBits());
switch (Opcode) {
case ISD::ADD:
case ISD::OR:
case ISD::XOR:
case ISD::UMAX:
- return Const->isZero();
+ return Const.isZero();
case ISD::MUL:
- return Const->isOne();
+ return Const.isOne();
case ISD::AND:
case ISD::UMIN:
- return Const->isAllOnes();
+ return Const.isAllOnes();
case ISD::SMAX:
- return Const->isMinSignedValue();
+ return Const.isMinSignedValue();
case ISD::SMIN:
- return Const->isMaxSignedValue();
+ return Const.isMaxSignedValue();
case ISD::SUB:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
- return OperandNo == 1 && Const->isZero();
+ return OperandNo == 1 && Const.isZero();
case ISD::UDIV:
case ISD::SDIV:
- return OperandNo == 1 && Const->isOne();
+ return OperandNo == 1 && Const.isOne();
}
} else if (auto *ConstFP = isConstOrConstSplatFP(V)) {
switch (Opcode) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 20375a0..6691aa4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -456,6 +456,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::CONVERGENCECTRL_ANCHOR: return "convergencectrl_anchor";
case ISD::CONVERGENCECTRL_ENTRY: return "convergencectrl_entry";
case ISD::CONVERGENCECTRL_LOOP: return "convergencectrl_loop";
+ case ISD::CONVERGENCECTRL_GLUE: return "convergencectrl_glue";
// Bit manipulation
case ISD::ABS: return "abs";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 962f0d9..409d66a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -742,6 +742,13 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
break;
}
+ case ISD::FREEZE: {
+ SDValue N0 = Op.getOperand(0);
+ if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ /*PoisonOnly=*/false))
+ return N0;
+ break;
+ }
case ISD::AND: {
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -3184,6 +3191,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
break;
}
+ case ISD::FREEZE: {
+ SDValue N0 = Op.getOperand(0);
+ if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ /*PoisonOnly=*/false))
+ return TLO.CombineTo(Op, N0);
+
+ // TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
+ // freeze(op(x, ...)) -> op(freeze(x), ...).
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && DemandedElts == 1)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ TLO.DAG.getFreeze(N0.getOperand(0))));
+ break;
+ }
case ISD::BUILD_VECTOR: {
// Check all elements and simplify any unused elements with UNDEF.
if (!DemandedElts.isAllOnes()) {
@@ -3524,6 +3545,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
[[fallthrough]];
}
+ case ISD::AVGCEILS:
+ case ISD::AVGCEILU:
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU:
case ISD::OR:
case ISD::XOR:
case ISD::SUB:
@@ -6046,11 +6071,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
Divisor.ashrInPlace(Shift);
UseSRA = true;
}
- // Calculate the multiplicative inverse, using Newton's method.
- APInt t;
- APInt Factor = Divisor;
- while ((t = Divisor * Factor) != 1)
- Factor *= APInt(Divisor.getBitWidth(), 2) - t;
+ APInt Factor = Divisor.multiplicativeInverse();
Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
Factors.push_back(DAG.getConstant(Factor, dl, SVT));
return true;
@@ -6639,10 +6660,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
unsigned W = D.getBitWidth();
- APInt P = D0.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
- assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ APInt P = D0.multiplicativeInverse();
assert((D0 * P).isOne() && "Multiplicative inverse basic check failed.");
// Q = floor((2^W - 1) u/ D)
@@ -6897,10 +6915,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
unsigned W = D.getBitWidth();
- APInt P = D0.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
- assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ APInt P = D0.multiplicativeInverse();
assert((D0 * P).isOne() && "Multiplicative inverse basic check failed.");
// A = floor((2^(W - 1) - 1) / D0) & -2^K
@@ -7626,7 +7641,7 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
//
// For division, we can compute the remainder using the algorithm described
// above, subtract it from the dividend to get an exact multiple of Constant.
-// Then multiply that extact multiply by the multiplicative inverse modulo
+// Then multiply that exact multiple by the multiplicative inverse modulo
// (1 << (BitWidth / 2)) to get the quotient.
// If Constant is even, we can shift right the dividend and the divisor by the
@@ -7761,10 +7776,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Multiply by the multiplicative inverse of the divisor modulo
// (1 << BitWidth).
- APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
- APInt MulFactor = Divisor.zext(BitWidth + 1);
- MulFactor = MulFactor.multiplicativeInverse(Mod);
- MulFactor = MulFactor.trunc(BitWidth);
+ APInt MulFactor = Divisor.multiplicativeInverse();
SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
DAG.getConstant(MulFactor, dl, VT));
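// Editorial note: the two ISD::FREEZE cases added above fold freeze(x) -> x
// once x is proven neither undef nor poison for the demanded elements, plus
// a narrow freeze(scalar_to_vector(x)) -> scalar_to_vector(freeze(x)) case
// pending the general fold from DAGCombiner::visitFREEZE.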
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index ab57d08..a4b2299 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -161,9 +161,11 @@ class ShrinkWrap : public MachineFunctionPass {
/// Current MachineFunction.
MachineFunction *MachineFunc = nullptr;
- /// Is `true` for block numbers where we can guarantee no stack access
- /// or computation of stack-relative addresses on any CFG path including
- /// the block itself.
+  /// Is `true` for block numbers where we assume that stack accesses or
+  /// computation of stack-relative addresses may occur on some CFG path
+  /// including the block itself, and `false` for basic blocks where we can
+  /// guarantee the opposite. False positives cannot make the analysis
+  /// incorrect, so this conservative approximation is sound.
BitVector StackAddressUsedBlockInfo;
/// Check if \p MI uses or defines a callee-saved register or
@@ -948,6 +950,9 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
+ // Initially, conservatively assume that stack addresses can be used in each
+ // basic block and change the state only for those basic blocks for which we
+ // were able to prove the opposite.
StackAddressUsedBlockInfo.resize(MF.getNumBlockIDs(), true);
bool HasCandidate = performShrinkWrapping(RPOT, RS.get());
StackAddressUsedBlockInfo.clear();
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a44f6af..0f8c984 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -983,7 +983,8 @@ static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
return Intrinsic::not_intrinsic;
}
-static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
+static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
+ bool CanUpgradeDebugIntrinsicsToRecords) {
assert(F && "Illegal to upgrade a non-existent Function.");
StringRef Name = F->getName();
@@ -1057,7 +1058,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
case 'd':
if (Name.consume_front("dbg.")) {
// Mark debug intrinsics for upgrade to new debug format.
- if (F->getParent()->IsNewDbgInfoFormat) {
+ if (CanUpgradeDebugIntrinsicsToRecords &&
+ F->getParent()->IsNewDbgInfoFormat) {
if (Name == "addr" || Name == "value" || Name == "assign" ||
Name == "declare" || Name == "label") {
// There's no function to replace these with.
@@ -1413,9 +1415,11 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return false;
}
-bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
+bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn,
+ bool CanUpgradeDebugIntrinsicsToRecords) {
NewFn = nullptr;
- bool Upgraded = upgradeIntrinsicFunction1(F, NewFn);
+ bool Upgraded =
+ upgradeIntrinsicFunction1(F, NewFn, CanUpgradeDebugIntrinsicsToRecords);
assert(F != NewFn && "Intrinsic function upgraded to the same function");
// Upgrade intrinsic attributes. This does not change the function.
@@ -2412,6 +2416,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
if (!NewFn) {
+ bool FallthroughToDefaultUpgrade = false;
// Get the Function's name.
StringRef Name = F->getName();
@@ -4262,16 +4267,30 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder);
} else if (IsAMDGCN) {
Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
- } else if (IsDbg && CI->getModule()->IsNewDbgInfoFormat) {
- upgradeDbgIntrinsicToDbgRecord(Name, CI);
+ } else if (IsDbg) {
+ // We might have decided we don't want the new format after all between
+ // first requesting the upgrade and now; skip the conversion if that is
+ // the case, and check here to see if the intrinsic needs to be upgraded
+ // normally.
+ if (!CI->getModule()->IsNewDbgInfoFormat) {
+ bool NeedsUpgrade =
+          upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn,
+                                    /*CanUpgradeDebugIntrinsicsToRecords=*/false);
+ if (!NeedsUpgrade)
+ return;
+ FallthroughToDefaultUpgrade = true;
+ } else {
+ upgradeDbgIntrinsicToDbgRecord(Name, CI);
+ }
} else {
llvm_unreachable("Unknown function for CallBase upgrade.");
}
- if (Rep)
- CI->replaceAllUsesWith(Rep);
- CI->eraseFromParent();
- return;
+ if (!FallthroughToDefaultUpgrade) {
+ if (Rep)
+ CI->replaceAllUsesWith(Rep);
+ CI->eraseFromParent();
+ return;
+ }
}
const auto &DefaultCase = [&]() -> void {
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 953f21c..d361bd9 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -531,9 +531,7 @@ bool PassManagerImpl::run(Module &M) {
// RemoveDIs: if a command line flag is given, convert to the
// DbgVariableRecord representation of debug-info for the duration of these
// passes.
- bool shouldConvertDbgInfo = UseNewDbgInfoFormat && !M.IsNewDbgInfoFormat;
- if (shouldConvertDbgInfo)
- M.convertToNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat);
for (ImmutablePass *ImPass : getImmutablePasses())
Changed |= ImPass->doInitialization(M);
@@ -547,9 +545,6 @@ bool PassManagerImpl::run(Module &M) {
for (ImmutablePass *ImPass : getImmutablePasses())
Changed |= ImPass->doFinalization(M);
- if (shouldConvertDbgInfo)
- M.convertFromNewDbgValues();
-
return Changed;
}
} // namespace legacy
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 33f3584..64c5991 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -1734,8 +1734,28 @@ void Verifier::visitModuleFlags() {
// Scan each flag, and track the flags and requirements.
DenseMap<const MDString*, const MDNode*> SeenIDs;
SmallVector<const MDNode*, 16> Requirements;
- for (const MDNode *MDN : Flags->operands())
+ uint64_t PAuthABIPlatform = -1;
+ uint64_t PAuthABIVersion = -1;
+ for (const MDNode *MDN : Flags->operands()) {
visitModuleFlag(MDN, SeenIDs, Requirements);
+ if (MDN->getNumOperands() != 3)
+ continue;
+ if (const auto *FlagName = dyn_cast_or_null<MDString>(MDN->getOperand(1))) {
+ if (FlagName->getString() == "aarch64-elf-pauthabi-platform") {
+ if (const auto *PAP =
+ mdconst::dyn_extract_or_null<ConstantInt>(MDN->getOperand(2)))
+ PAuthABIPlatform = PAP->getZExtValue();
+ } else if (FlagName->getString() == "aarch64-elf-pauthabi-version") {
+ if (const auto *PAV =
+ mdconst::dyn_extract_or_null<ConstantInt>(MDN->getOperand(2)))
+ PAuthABIVersion = PAV->getZExtValue();
+ }
+ }
+ }
+
+ if ((PAuthABIPlatform == uint64_t(-1)) != (PAuthABIVersion == uint64_t(-1)))
+ CheckFailed("either both or no 'aarch64-elf-pauthabi-platform' and "
+ "'aarch64-elf-pauthabi-version' module flags must be present");
// Validate that the requirements in the module are valid.
for (const MDNode *Requirement : Requirements) {
@@ -4343,6 +4363,11 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
if (auto *II = dyn_cast<InvokeInst>(TI)) {
Check(II->getUnwindDest() == BB && II->getNormalDest() != BB,
"EH pad must be jumped to via an unwind edge", ToPad, II);
+ auto *CalledFn =
+ dyn_cast<Function>(II->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && CalledFn->isIntrinsic() && II->doesNotThrow() &&
+ !IntrinsicInst::mayLowerToFunctionCall(CalledFn->getIntrinsicID()))
+ continue;
if (auto Bundle = II->getOperandBundle(LLVMContext::OB_funclet))
FromPad = Bundle->Inputs[0];
else
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a7e6db8..7a5aa0c 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1548,25 +1548,10 @@ Error IRLinker::run() {
return Err;
// Convert source module to match dest for the duration of the link.
- bool SrcModuleNewDbgFormat = SrcM->IsNewDbgInfoFormat;
- if (DstM.IsNewDbgInfoFormat != SrcM->IsNewDbgInfoFormat) {
- if (DstM.IsNewDbgInfoFormat)
- SrcM->convertToNewDbgValues();
- else
- SrcM->convertFromNewDbgValues();
- }
- // Undo debug mode conversion afterwards.
- auto Cleanup = make_scope_exit([&]() {
- if (SrcModuleNewDbgFormat != SrcM->IsNewDbgInfoFormat) {
- if (SrcModuleNewDbgFormat)
- SrcM->convertToNewDbgValues();
- else
- SrcM->convertFromNewDbgValues();
- }
- });
+ ScopedDbgInfoFormatSetter FormatSetter(*SrcM, DstM.IsNewDbgInfoFormat);
- // Inherit the target data from the source module if the destination module
- // doesn't have one already.
+ // Inherit the target data from the source module if the destination
+ // module doesn't have one already.
if (DstM.getDataLayout().isDefault())
DstM.setDataLayout(SrcM->getDataLayout());
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 205bc1e..f343d14 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -215,23 +215,41 @@ static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
}
Error Object::compressOrDecompressSections(const CommonConfig &Config) {
- // Build a list of the debug sections we are going to replace.
- // We can't call `AddSection` while iterating over sections,
+ // Build a list of sections we are going to replace.
+ // We can't call `addSection` while iterating over sections,
// because it would mutate the sections array.
SmallVector<std::pair<SectionBase *, std::function<SectionBase *()>>, 0>
ToReplace;
for (SectionBase &Sec : sections()) {
- if ((Sec.Flags & SHF_ALLOC) || !StringRef(Sec.Name).starts_with(".debug"))
+ std::optional<DebugCompressionType> CType;
+ for (auto &[Matcher, T] : Config.compressSections)
+ if (Matcher.matches(Sec.Name))
+ CType = T;
+ // Handle --compress-debug-sections and --decompress-debug-sections, which
+ // apply to non-ALLOC debug sections.
+ if (!(Sec.Flags & SHF_ALLOC) && StringRef(Sec.Name).starts_with(".debug")) {
+ if (Config.CompressionType != DebugCompressionType::None)
+ CType = Config.CompressionType;
+ else if (Config.DecompressDebugSections)
+ CType = DebugCompressionType::None;
+ }
+ if (!CType)
continue;
+
+ if (Sec.ParentSegment)
+ return createStringError(
+ errc::invalid_argument,
+ "section '" + Sec.Name +
+ "' within a segment cannot be (de)compressed");
+
if (auto *CS = dyn_cast<CompressedSection>(&Sec)) {
- if (Config.DecompressDebugSections) {
+ if (*CType == DebugCompressionType::None)
ToReplace.emplace_back(
&Sec, [=] { return &addSection<DecompressedSection>(*CS); });
- }
- } else if (Config.CompressionType != DebugCompressionType::None) {
- ToReplace.emplace_back(&Sec, [&, S = &Sec] {
+ } else if (*CType != DebugCompressionType::None) {
+ ToReplace.emplace_back(&Sec, [=, S = &Sec] {
return &addSection<CompressedSection>(
- CompressedSection(*S, Config.CompressionType, Is64Bits));
+ CompressedSection(*S, *CType, Is64Bits));
});
}
}
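// Editorial note: this generalizes (de)compression beyond .debug sections; a
// hypothetical invocation exercising the per-section matchers above:
//   llvm-objcopy --compress-sections '.debug_*=zstd' in.o out.o
// Later matchers override earlier ones, --compress-debug-sections and
// --decompress-debug-sections still apply to non-ALLOC .debug sections, and
// sections living inside a segment are now rejected with a clear error.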
diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp
index 87009126..18506f3 100644
--- a/llvm/lib/Object/COFFObjectFile.cpp
+++ b/llvm/lib/Object/COFFObjectFile.cpp
@@ -14,18 +14,17 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/BinaryFormat/COFF.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/Error.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBufferRef.h"
-#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
@@ -1072,20 +1071,7 @@ StringRef COFFObjectFile::getFileFormatName() const {
}
Triple::ArchType COFFObjectFile::getArch() const {
- switch (getMachine()) {
- case COFF::IMAGE_FILE_MACHINE_I386:
- return Triple::x86;
- case COFF::IMAGE_FILE_MACHINE_AMD64:
- return Triple::x86_64;
- case COFF::IMAGE_FILE_MACHINE_ARMNT:
- return Triple::thumb;
- case COFF::IMAGE_FILE_MACHINE_ARM64:
- case COFF::IMAGE_FILE_MACHINE_ARM64EC:
- case COFF::IMAGE_FILE_MACHINE_ARM64X:
- return Triple::aarch64;
- default:
- return Triple::UnknownArch;
- }
+ return getMachineArchType(getMachine());
}
Expected<uint64_t> COFFObjectFile::getStartAddress() const {
@@ -1320,8 +1306,8 @@ COFFObjectFile::getRelocations(const coff_section *Sec) const {
return #reloc_type;
StringRef COFFObjectFile::getRelocationTypeName(uint16_t Type) const {
- switch (getMachine()) {
- case COFF::IMAGE_FILE_MACHINE_AMD64:
+ switch (getArch()) {
+ case Triple::x86_64:
switch (Type) {
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ABSOLUTE);
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR64);
@@ -1344,7 +1330,7 @@ StringRef COFFObjectFile::getRelocationTypeName(uint16_t Type) const {
return "Unknown";
}
break;
- case COFF::IMAGE_FILE_MACHINE_ARMNT:
+ case Triple::thumb:
switch (Type) {
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_ABSOLUTE);
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_ADDR32);
@@ -1367,9 +1353,7 @@ StringRef COFFObjectFile::getRelocationTypeName(uint16_t Type) const {
return "Unknown";
}
break;
- case COFF::IMAGE_FILE_MACHINE_ARM64:
- case COFF::IMAGE_FILE_MACHINE_ARM64EC:
- case COFF::IMAGE_FILE_MACHINE_ARM64X:
+ case Triple::aarch64:
switch (Type) {
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ABSOLUTE);
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32);
@@ -1393,7 +1377,7 @@ StringRef COFFObjectFile::getRelocationTypeName(uint16_t Type) const {
return "Unknown";
}
break;
- case COFF::IMAGE_FILE_MACHINE_I386:
+ case Triple::x86:
switch (Type) {
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE);
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR16);
@@ -1941,19 +1925,17 @@ ResourceSectionRef::getContents(const coff_resource_data_entry &Entry) {
// the expected type.
const coff_relocation &R = **RelocsForOffset.first;
uint16_t RVAReloc;
- switch (Obj->getMachine()) {
- case COFF::IMAGE_FILE_MACHINE_I386:
+ switch (Obj->getArch()) {
+ case Triple::x86:
RVAReloc = COFF::IMAGE_REL_I386_DIR32NB;
break;
- case COFF::IMAGE_FILE_MACHINE_AMD64:
+ case Triple::x86_64:
RVAReloc = COFF::IMAGE_REL_AMD64_ADDR32NB;
break;
- case COFF::IMAGE_FILE_MACHINE_ARMNT:
+ case Triple::thumb:
RVAReloc = COFF::IMAGE_REL_ARM_ADDR32NB;
break;
- case COFF::IMAGE_FILE_MACHINE_ARM64:
- case COFF::IMAGE_FILE_MACHINE_ARM64EC:
- case COFF::IMAGE_FILE_MACHINE_ARM64X:
+ case Triple::aarch64:
RVAReloc = COFF::IMAGE_REL_ARM64_ADDR32NB;
break;
default:
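// Editorial note: getMachineArchType (from WindowsMachineFlag.h) centralizes
// the mapping the deleted switches encoded, e.g. IMAGE_FILE_MACHINE_ARM64,
// ARM64EC and ARM64X all collapse to Triple::aarch64.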
diff --git a/llvm/lib/Object/WindowsResource.cpp b/llvm/lib/Object/WindowsResource.cpp
index 61ca49e..983c8e3 100644
--- a/llvm/lib/Object/WindowsResource.cpp
+++ b/llvm/lib/Object/WindowsResource.cpp
@@ -12,6 +12,7 @@
#include "llvm/Object/WindowsResource.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -978,19 +979,17 @@ void WindowsResourceCOFFWriter::writeFirstSectionRelocations() {
reinterpret_cast<coff_relocation *>(BufferStart + CurrentOffset);
Reloc->VirtualAddress = RelocationAddresses[i];
Reloc->SymbolTableIndex = NextSymbolIndex++;
- switch (MachineType) {
- case COFF::IMAGE_FILE_MACHINE_ARMNT:
+ switch (getMachineArchType(MachineType)) {
+ case Triple::thumb:
Reloc->Type = COFF::IMAGE_REL_ARM_ADDR32NB;
break;
- case COFF::IMAGE_FILE_MACHINE_AMD64:
+ case Triple::x86_64:
Reloc->Type = COFF::IMAGE_REL_AMD64_ADDR32NB;
break;
- case COFF::IMAGE_FILE_MACHINE_I386:
+ case Triple::x86:
Reloc->Type = COFF::IMAGE_REL_I386_DIR32NB;
break;
- case COFF::IMAGE_FILE_MACHINE_ARM64:
- case COFF::IMAGE_FILE_MACHINE_ARM64EC:
- case COFF::IMAGE_FILE_MACHINE_ARM64X:
+ case Triple::aarch64:
Reloc->Type = COFF::IMAGE_REL_ARM64_ADDR32NB;
break;
default:
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 7ac5c56..884334e 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1303,7 +1303,7 @@ Error IndexedInstrProfReader::readHeader() {
MemProfRecordTable.reset(MemProfRecordHashTable::Create(
/*Buckets=*/Start + RecordTableOffset,
/*Payload=*/Ptr,
- /*Base=*/Start, memprof::RecordLookupTrait(Schema)));
+ /*Base=*/Start, memprof::RecordLookupTrait(memprof::Version1, Schema)));
// Initialize the frame table reader with the payload and bucket offsets.
MemProfFrameTable.reset(MemProfFrameHashTable::Create(
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index c2c94ba..72d77d5 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -414,6 +414,144 @@ static void setSummary(IndexedInstrProf::Summary *TheSummary,
TheSummary->setEntry(I, Res[I]);
}
+// Serialize Schema.
+static void writeMemProfSchema(ProfOStream &OS,
+ const memprof::MemProfSchema &Schema) {
+ OS.write(static_cast<uint64_t>(Schema.size()));
+ for (const auto Id : Schema)
+ OS.write(static_cast<uint64_t>(Id));
+}
+
+// Serialize MemProfRecordData. Return RecordTableOffset.
+static uint64_t writeMemProfRecords(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ memprof::MemProfSchema *Schema) {
+ auto RecordWriter =
+ std::make_unique<memprof::RecordWriterTrait>(memprof::Version1);
+ RecordWriter->Schema = Schema;
+ OnDiskChainedHashTableGenerator<memprof::RecordWriterTrait>
+ RecordTableGenerator;
+ for (auto &I : MemProfRecordData) {
+ // Insert the key (func hash) and value (memprof record).
+ RecordTableGenerator.insert(I.first, I.second, *RecordWriter.get());
+ }
+ // Release the memory of this MapVector as it is no longer needed.
+ MemProfRecordData.clear();
+
+ // The call to Emit invokes RecordWriterTrait::EmitData which destructs
+ // the memprof record copies owned by the RecordTableGenerator. This works
+ // because the RecordTableGenerator is not used after this point.
+ return RecordTableGenerator.Emit(OS.OS, *RecordWriter);
+}
+
+// Serialize MemProfFrameData. Return FrameTableOffset.
+static uint64_t writeMemProfFrames(
+ ProfOStream &OS,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+ auto FrameWriter = std::make_unique<memprof::FrameWriterTrait>();
+ OnDiskChainedHashTableGenerator<memprof::FrameWriterTrait>
+ FrameTableGenerator;
+ for (auto &I : MemProfFrameData) {
+ // Insert the key (frame id) and value (frame contents).
+ FrameTableGenerator.insert(I.first, I.second);
+ }
+ // Release the memory of this MapVector as it is no longer needed.
+ MemProfFrameData.clear();
+
+ return FrameTableGenerator.Emit(OS.OS, *FrameWriter);
+}
+
+static Error writeMemProfV0(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+ uint64_t HeaderUpdatePos = OS.tell();
+ OS.write(0ULL); // Reserve space for the memprof record table offset.
+ OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+ OS.write(0ULL); // Reserve space for the memprof frame table offset.
+
+ auto Schema = memprof::PortableMemInfoBlock::getSchema();
+ writeMemProfSchema(OS, Schema);
+
+ uint64_t RecordTableOffset =
+ writeMemProfRecords(OS, MemProfRecordData, &Schema);
+
+ uint64_t FramePayloadOffset = OS.tell();
+ uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfFrameData);
+
+ uint64_t Header[] = {RecordTableOffset, FramePayloadOffset, FrameTableOffset};
+ OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+
+ return Error::success();
+}
+
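+// Editorial note: writeMemProfV1 below emits the same payload as
+// writeMemProfV0; only the leading version word differs, so both versions
+// share the record and frame table layouts.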
+static Error writeMemProfV1(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+  OS.write(memprof::Version1);
+ uint64_t HeaderUpdatePos = OS.tell();
+ OS.write(0ULL); // Reserve space for the memprof record table offset.
+ OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+ OS.write(0ULL); // Reserve space for the memprof frame table offset.
+
+ auto Schema = memprof::PortableMemInfoBlock::getSchema();
+ writeMemProfSchema(OS, Schema);
+
+ uint64_t RecordTableOffset =
+ writeMemProfRecords(OS, MemProfRecordData, &Schema);
+
+ uint64_t FramePayloadOffset = OS.tell();
+ uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfFrameData);
+
+ uint64_t Header[] = {RecordTableOffset, FramePayloadOffset, FrameTableOffset};
+ OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+
+ return Error::success();
+}
+
+// The MemProf profile data includes a simple schema
+// with the format described below followed by the hash tables:
+// uint64_t Version
+// uint64_t RecordTableOffset = RecordTableGenerator.Emit
+// uint64_t FramePayloadOffset = Stream offset before emitting the frame table
+// uint64_t FrameTableOffset = FrameTableGenerator.Emit
+// uint64_t Num schema entries
+// uint64_t Schema entry 0
+// uint64_t Schema entry 1
+// ....
+// uint64_t Schema entry N - 1
+// OnDiskChainedHashTable MemProfRecordData
+// OnDiskChainedHashTable MemProfFrameData
+static Error writeMemProf(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
+ memprof::IndexedVersion MemProfVersionRequested) {
+
+ switch (MemProfVersionRequested) {
+ case memprof::Version0:
+ return writeMemProfV0(OS, MemProfRecordData, MemProfFrameData);
+ case memprof::Version1:
+ return writeMemProfV1(OS, MemProfRecordData, MemProfFrameData);
+ case memprof::Version2:
+ // TODO: Implement. Fall through to the error handling below for now.
+ break;
+ }
+
+ return make_error<InstrProfError>(
+ instrprof_error::unsupported_version,
+ formatv("MemProf version {} not supported; "
+ "requires version between {} and {}, inclusive",
+ MemProfVersionRequested, memprof::MinimumSupportedVersion,
+ memprof::MaximumSupportedVersion));
+}
+
Error InstrProfWriter::writeImpl(ProfOStream &OS) {
using namespace IndexedInstrProf;
using namespace support;
@@ -517,84 +655,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
// Write the hash table.
uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj);
- // Write the MemProf profile data if we have it. This includes a simple schema
- // with the format described below followed by the hashtable:
- // uint64_t Version
- // uint64_t RecordTableOffset = RecordTableGenerator.Emit
- // uint64_t FramePayloadOffset = Stream offset before emitting the frame table
- // uint64_t FrameTableOffset = FrameTableGenerator.Emit
- // uint64_t Num schema entries
- // uint64_t Schema entry 0
- // uint64_t Schema entry 1
- // ....
- // uint64_t Schema entry N - 1
- // OnDiskChainedHashTable MemProfRecordData
- // OnDiskChainedHashTable MemProfFrameData
+ // Write the MemProf profile data if we have it.
uint64_t MemProfSectionStart = 0;
if (static_cast<bool>(ProfileKind & InstrProfKind::MemProf)) {
- if (MemProfVersionRequested < memprof::MinimumSupportedVersion ||
- MemProfVersionRequested > memprof::MaximumSupportedVersion) {
- return make_error<InstrProfError>(
- instrprof_error::unsupported_version,
- formatv("MemProf version {} not supported; "
- "requires version between {} and {}, inclusive",
- MemProfVersionRequested, memprof::MinimumSupportedVersion,
- memprof::MaximumSupportedVersion));
- }
-
MemProfSectionStart = OS.tell();
-
- if (MemProfVersionRequested >= memprof::Version1)
- OS.write(MemProfVersionRequested);
-
- OS.write(0ULL); // Reserve space for the memprof record table offset.
- OS.write(0ULL); // Reserve space for the memprof frame payload offset.
- OS.write(0ULL); // Reserve space for the memprof frame table offset.
-
- auto Schema = memprof::PortableMemInfoBlock::getSchema();
- OS.write(static_cast<uint64_t>(Schema.size()));
- for (const auto Id : Schema) {
- OS.write(static_cast<uint64_t>(Id));
- }
-
- auto RecordWriter = std::make_unique<memprof::RecordWriterTrait>();
- RecordWriter->Schema = &Schema;
- OnDiskChainedHashTableGenerator<memprof::RecordWriterTrait>
- RecordTableGenerator;
- for (auto &I : MemProfRecordData) {
- // Insert the key (func hash) and value (memprof record).
- RecordTableGenerator.insert(I.first, I.second);
- }
- // Release the memory of this MapVector as it is no longer needed.
- MemProfRecordData.clear();
-
- // The call to Emit invokes RecordWriterTrait::EmitData which destructs
- // the memprof record copies owned by the RecordTableGenerator. This works
- // because the RecordTableGenerator is not used after this point.
- uint64_t RecordTableOffset =
- RecordTableGenerator.Emit(OS.OS, *RecordWriter);
-
- uint64_t FramePayloadOffset = OS.tell();
-
- auto FrameWriter = std::make_unique<memprof::FrameWriterTrait>();
- OnDiskChainedHashTableGenerator<memprof::FrameWriterTrait>
- FrameTableGenerator;
- for (auto &I : MemProfFrameData) {
- // Insert the key (frame id) and value (frame contents).
- FrameTableGenerator.insert(I.first, I.second);
- }
- // Release the memory of this MapVector as it is no longer needed.
- MemProfFrameData.clear();
-
- uint64_t FrameTableOffset = FrameTableGenerator.Emit(OS.OS, *FrameWriter);
-
- uint64_t Header[] = {RecordTableOffset, FramePayloadOffset,
- FrameTableOffset};
- uint64_t HeaderUpdatePos = MemProfSectionStart;
- if (MemProfVersionRequested >= memprof::Version1)
- // The updates go just after the version field.
- HeaderUpdatePos += sizeof(uint64_t);
- OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+ if (auto E = writeMemProf(OS, MemProfRecordData, MemProfFrameData,
+ MemProfVersionRequested))
+ return E;
}
// BinaryIdSection has two parts:
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 6c41981..ac0a870 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -10,15 +10,88 @@
namespace llvm {
namespace memprof {
+namespace {
+size_t serializedSizeV0(const IndexedAllocationInfo &IAI) {
+ size_t Size = 0;
+ // The number of frames to serialize.
+ Size += sizeof(uint64_t);
+ // The callstack frame ids.
+ Size += sizeof(FrameId) * IAI.CallStack.size();
+ // The size of the payload.
+ Size += PortableMemInfoBlock::serializedSize();
+ return Size;
+}
-void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
- raw_ostream &OS) {
+size_t serializedSizeV2(const IndexedAllocationInfo &IAI) {
+ size_t Size = 0;
+ // The CallStackId
+ Size += sizeof(CallStackId);
+ // The size of the payload.
+ Size += PortableMemInfoBlock::serializedSize();
+ return Size;
+}
+} // namespace
+
+size_t IndexedAllocationInfo::serializedSize(IndexedVersion Version) const {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ return serializedSizeV0(*this);
+ case Version2:
+ return serializedSizeV2(*this);
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
+namespace {
+size_t serializedSizeV0(const IndexedMemProfRecord &Record) {
+ size_t Result = sizeof(GlobalValue::GUID);
+ for (const IndexedAllocationInfo &N : Record.AllocSites)
+ Result += N.serializedSize(Version0);
+
+ // The number of callsites we have information for.
+ Result += sizeof(uint64_t);
+ for (const auto &Frames : Record.CallSites) {
+ // The number of frame ids to serialize.
+ Result += sizeof(uint64_t);
+ Result += Frames.size() * sizeof(FrameId);
+ }
+ return Result;
+}
+
+size_t serializedSizeV2(const IndexedMemProfRecord &Record) {
+ size_t Result = sizeof(GlobalValue::GUID);
+ for (const IndexedAllocationInfo &N : Record.AllocSites)
+ Result += N.serializedSize(Version2);
+
+ // The number of callsites we have information for.
+ Result += sizeof(uint64_t);
+ // The CallStackId
+ Result += Record.CallSiteIds.size() * sizeof(CallStackId);
+ return Result;
+}
+} // namespace
+
+size_t IndexedMemProfRecord::serializedSize(IndexedVersion Version) const {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ return serializedSizeV0(*this);
+ case Version2:
+ return serializedSizeV2(*this);
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
+namespace {
+void serializeV0(const IndexedMemProfRecord &Record,
+ const MemProfSchema &Schema, raw_ostream &OS) {
using namespace support;
endian::Writer LE(OS, llvm::endianness::little);
- LE.write<uint64_t>(AllocSites.size());
- for (const IndexedAllocationInfo &N : AllocSites) {
+ LE.write<uint64_t>(Record.AllocSites.size());
+ for (const IndexedAllocationInfo &N : Record.AllocSites) {
LE.write<uint64_t>(N.CallStack.size());
for (const FrameId &Id : N.CallStack)
LE.write<FrameId>(Id);
@@ -26,17 +99,50 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
}
// Related contexts.
- LE.write<uint64_t>(CallSites.size());
- for (const auto &Frames : CallSites) {
+ LE.write<uint64_t>(Record.CallSites.size());
+ for (const auto &Frames : Record.CallSites) {
LE.write<uint64_t>(Frames.size());
for (const FrameId &Id : Frames)
LE.write<FrameId>(Id);
}
}
-IndexedMemProfRecord
-IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
- const unsigned char *Ptr) {
+void serializeV2(const IndexedMemProfRecord &Record,
+ const MemProfSchema &Schema, raw_ostream &OS) {
+ using namespace support;
+
+ endian::Writer LE(OS, llvm::endianness::little);
+
+ LE.write<uint64_t>(Record.AllocSites.size());
+ for (const IndexedAllocationInfo &N : Record.AllocSites) {
+ LE.write<CallStackId>(N.CSId);
+ N.Info.serialize(Schema, OS);
+ }
+
+ // Related contexts.
+ LE.write<uint64_t>(Record.CallSiteIds.size());
+ for (const auto &CSId : Record.CallSiteIds)
+ LE.write<CallStackId>(CSId);
+}
+} // namespace
+
+void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
+ raw_ostream &OS, IndexedVersion Version) {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ serializeV0(*this, Schema, OS);
+ return;
+ case Version2:
+ serializeV2(*this, Schema, OS);
+ return;
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
+namespace {
+IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
+ const unsigned char *Ptr) {
using namespace support;
IndexedMemProfRecord Record;
@@ -73,11 +179,57 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
Frames.push_back(Id);
}
Record.CallSites.push_back(Frames);
+ Record.CallSiteIds.push_back(hashCallStack(Frames));
}
return Record;
}
+IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
+ const unsigned char *Ptr) {
+ using namespace support;
+
+ IndexedMemProfRecord Record;
+
+ // Read the meminfo nodes.
+ const uint64_t NumNodes =
+ endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+ for (uint64_t I = 0; I < NumNodes; I++) {
+ IndexedAllocationInfo Node;
+ Node.CSId =
+ endian::readNext<CallStackId, llvm::endianness::little, unaligned>(Ptr);
+ Node.Info.deserialize(Schema, Ptr);
+ Ptr += PortableMemInfoBlock::serializedSize();
+ Record.AllocSites.push_back(Node);
+ }
+
+ // Read the callsite information.
+ const uint64_t NumCtxs =
+ endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+ for (uint64_t J = 0; J < NumCtxs; J++) {
+ CallStackId CSId =
+ endian::readNext<CallStackId, llvm::endianness::little, unaligned>(Ptr);
+ Record.CallSiteIds.push_back(CSId);
+ }
+
+ return Record;
+}
+} // namespace
+
+IndexedMemProfRecord
+IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
+ const unsigned char *Ptr,
+ IndexedVersion Version) {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ return deserializeV0(Schema, Ptr);
+ case Version2:
+ return deserializeV2(Schema, Ptr);
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
// Canonicalize the function name to drop suffixes such as ".llvm.". Note
// we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop
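To make the V2 wire format above concrete, here is a rough standalone sketch of the byte accounting that serializeV2 performs per record. It assumes 64-bit CallStackIds, as in the patch; the PortableMemInfoBlock payload width is schema-dependent, so it is left as a parameter. Names here are illustrative, not LLVM API.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sketch of the bytes serializeV2 emits for one record: an alloc-site
// count, then (CallStackId + MemInfoBlock payload) per alloc site, then
// a callsite count and one CallStackId per call site.
constexpr size_t CSIdSize = sizeof(uint64_t); // CallStackId is 64-bit

size_t bytesWrittenV2(size_t NumAllocSites, size_t MIBPayloadSize,
                      size_t NumCallSiteIds) {
  size_t Size = sizeof(uint64_t);                      // alloc-site count
  Size += NumAllocSites * (CSIdSize + MIBPayloadSize); // per alloc site
  Size += sizeof(uint64_t);                            // callsite count
  Size += NumCallSiteIds * CSIdSize;                   // per call site
  return Size;
}

int main() {
  // E.g. 2 alloc sites with a hypothetical 200-byte payload, 3 call sites.
  printf("%zu\n", bytesWrittenV2(2, 200, 3)); // 8 + 2*208 + 8 + 3*8 = 456
}
```

The payoff relative to V0 is visible in the two formulas: V0 stores every FrameId of every call stack inline, while V2 stores one fixed-size CallStackId per stack and outlines the frames.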
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index c206097..224ea09 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -1240,53 +1240,17 @@ APInt APInt::sqrt() const {
return x_old + 1;
}
-/// Computes the multiplicative inverse of this APInt for a given modulo. The
-/// iterative extended Euclidean algorithm is used to solve for this value,
-/// however we simplify it to speed up calculating only the inverse, and take
-/// advantage of div+rem calculations. We also use some tricks to avoid copying
-/// (potentially large) APInts around.
-/// WARNING: a value of '0' may be returned,
-/// signifying that no multiplicative inverse exists!
-APInt APInt::multiplicativeInverse(const APInt& modulo) const {
- assert(ult(modulo) && "This APInt must be smaller than the modulo");
-
- // Using the properties listed at the following web page (accessed 06/21/08):
- // http://www.numbertheory.org/php/euclid.html
- // (especially the properties numbered 3, 4 and 9) it can be proved that
- // BitWidth bits suffice for all the computations in the algorithm implemented
- // below. More precisely, this number of bits suffice if the multiplicative
- // inverse exists, but may not suffice for the general extended Euclidean
- // algorithm.
-
- APInt r[2] = { modulo, *this };
- APInt t[2] = { APInt(BitWidth, 0), APInt(BitWidth, 1) };
- APInt q(BitWidth, 0);
-
- unsigned i;
- for (i = 0; r[i^1] != 0; i ^= 1) {
- // An overview of the math without the confusing bit-flipping:
- // q = r[i-2] / r[i-1]
- // r[i] = r[i-2] % r[i-1]
- // t[i] = t[i-2] - t[i-1] * q
- udivrem(r[i], r[i^1], q, r[i]);
- t[i] -= t[i^1] * q;
- }
-
- // If this APInt and the modulo are not coprime, there is no multiplicative
- // inverse, so return 0. We check this by looking at the next-to-last
- // remainder, which is the gcd(*this,modulo) as calculated by the Euclidean
- // algorithm.
- if (r[i] != 1)
- return APInt(BitWidth, 0);
-
- // The next-to-last t is the multiplicative inverse. However, we are
- // interested in a positive inverse. Calculate a positive one from a negative
- // one if necessary. A simple addition of the modulo suffices because
- // abs(t[i]) is known to be less than *this/2 (see the link above).
- if (t[i].isNegative())
- t[i] += modulo;
-
- return std::move(t[i]);
+/// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth.
+APInt APInt::multiplicativeInverse() const {
+ assert((*this)[0] &&
+ "multiplicative inverse is only defined for odd numbers!");
+
+ // Use Newton's method.
+ APInt Factor = *this;
+ APInt T;
+ while (!(T = *this * Factor).isOne())
+ Factor *= 2 - T;
+ return Factor;
}
/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
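The replacement relies on the Hensel/Newton iteration x -> x * (2 - a*x), which doubles the number of correct low bits per step when a is odd (and any odd a satisfies a*a == 1 mod 8, so a itself is a valid seed, matching the code's choice of Factor = *this). A minimal sketch of the same idea on uint64_t, i.e. modulo 2^64:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Newton/Hensel iteration for the inverse of an odd integer modulo 2^64;
// unsigned overflow gives the mod-2^64 wraparound for free.
uint64_t inverseMod2e64(uint64_t A) {
  assert((A & 1) && "multiplicative inverse is only defined for odd numbers");
  uint64_t X = A; // A*A == 1 (mod 8): three correct low bits to start from
  for (uint64_t T; (T = A * X) != 1;)
    X *= 2 - T;
  return X;
}

int main() {
  uint64_t A = 0x123456789ABCDEF1ULL;
  printf("%llu\n", (unsigned long long)(A * inverseMod2e64(A))); // prints 1
}
```

Since the correct bits go 3, 6, 12, 24, 48, 96, the loop terminates in at most five multiplicative steps at 64 bits, which is why the old extended-Euclid path could be dropped.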
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 6425aa9..3af427d 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -391,9 +391,18 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
"equivalent when the immediate does "
"not fit in the encoding.">;
-def FeatureAddrLSLFast : SubtargetFeature<
- "addr-lsl-fast", "HasAddrLSLFast", "true",
- "Address operands with logical shift of up to 3 places are cheap">;
+// Address operands with shift amount 2 or 3 are fast on all Arm chips except
+// some old Apple cores (A7-A10?) which handle all shifts slowly. Cortex-A57
+// and derived designs through Cortex-X1 take an extra micro-op for shifts
+// of 1 or 4. Other Arm chips handle all shifted operands at the same speed
+// as unshifted operands.
+//
+// We don't try to model the behavior of the old Apple cores because new code
+// targeting A7 is very unlikely to actually run on an A7. The Cortex cores
+// are modeled by FeatureAddrLSLSlow14.
+def FeatureAddrLSLSlow14 : SubtargetFeature<
+ "addr-lsl-slow-14", "HasAddrLSLSlow14", "true",
+ "Address operands with shift amount of 1 or 4 are slow">;
def FeatureALULSLFast : SubtargetFeature<
"alu-lsl-fast", "HasALULSLFast", "true",
@@ -885,6 +894,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureBalanceFPOps,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
+ FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -903,6 +913,7 @@ def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -910,6 +921,7 @@ def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -917,6 +929,7 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -924,7 +937,7 @@ def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -934,7 +947,7 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -944,7 +957,7 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -956,7 +969,7 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -968,7 +981,7 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -979,7 +992,6 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -990,7 +1002,6 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1001,7 +1012,6 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1012,7 +1022,6 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1028,7 +1037,7 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1039,7 +1048,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1047,7 +1055,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
"Cortex-X3 ARM processors", [
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
@@ -1057,7 +1064,6 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
"Cortex-X4 ARM processors", [
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
@@ -1215,7 +1221,6 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
@@ -1234,7 +1239,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
@@ -1244,7 +1248,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureStorePairSuppress]>;
@@ -1254,7 +1257,6 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureSlowSTRQro]>;
@@ -1268,7 +1270,7 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
"Neoverse N1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1278,7 +1280,6 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
"Neoverse N2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1288,7 +1289,6 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
"Neoverse 512-TVB ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1298,7 +1298,7 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
"Neoverse V1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1309,7 +1309,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
"Neoverse V2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1321,7 +1320,6 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
@@ -1381,7 +1379,6 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
@@ -1397,7 +1394,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
@@ -1414,7 +1410,6 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 4fa719a..f6ccd0e 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -268,13 +268,19 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
if (Sign->getZExtValue())
Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
- if (Flags == 0)
- return;
+ uint64_t PAuthABIPlatform = -1;
+ if (const auto *PAP = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("aarch64-elf-pauthabi-platform")))
+ PAuthABIPlatform = PAP->getZExtValue();
+ uint64_t PAuthABIVersion = -1;
+ if (const auto *PAV = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("aarch64-elf-pauthabi-version")))
+ PAuthABIVersion = PAV->getZExtValue();
// Emit a .note.gnu.property section with the flags.
auto *TS =
static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->emitNoteSection(Flags);
+ TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion);
}
void AArch64AsmPrinter::emitFunctionHeaderComment() {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 163ed52..51bec36 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -462,7 +462,7 @@ private:
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
- bool isWorthFoldingAddr(SDValue V) const;
+ bool isWorthFoldingAddr(SDValue V, unsigned Size) const;
bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
SDValue &Offset, SDValue &SignExtend);
@@ -674,17 +674,22 @@ static bool isWorthFoldingSHL(SDValue V) {
/// Determine whether it is worth to fold V into an extended register addressing
/// mode.
-bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
- // If a subtarget has a fastpath LSL we can fold a logical shift into
- // the addressing mode and save a cycle.
- if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
- isWorthFoldingSHL(V))
+
+ // If the subtarget is slow for shifts at this access size (2- and 16-byte
+ // accesses use shift amounts 1 and 4), folding a shift into multiple loads
+ // costs additional micro-ops.
+ if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16))
+ return false;
+
+ // Check whether we're going to emit the address arithmetic anyway because
+ // it's used by a non-address operation.
+ if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V))
return true;
- if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
+ if (V.getOpcode() == ISD::ADD) {
const SDValue LHS = V.getOperand(0);
const SDValue RHS = V.getOperand(1);
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -1203,7 +1208,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
- return isWorthFoldingAddr(N);
+ return isWorthFoldingAddr(N, Size);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1231,7 +1236,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1261,7 +1266,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFoldingAddr(LHS))
+ if (isWorthFoldingAddr(LHS, Size))
return true;
}
@@ -1273,7 +1278,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFoldingAddr(RHS))
+ if (isWorthFoldingAddr(RHS, Size))
return true;
}
@@ -1343,7 +1348,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d0c5e6b..22687b0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2993,7 +2993,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
return false;
Shift = AArch64_AM::getShiftValue(Shift);
if (!OptSize) {
- if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
+ if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
return false;
if (avoidSlowSTRQ(MemI))
return false;
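A note on the Size parameter threaded through these checks: for a scaled [base, index, lsl #n] address the shift amount is log2 of the access size, so the Size == 2 and Size == 16 cases above are exactly the shift amounts 1 and 4 that FeatureAddrLSLSlow14 penalizes. A tiny illustrative sketch of that relationship (the function name is ours, not LLVM's):

```cpp
#include <cassert>

// Illustrative only: a scaled AArch64 address [Xn, Xm, lsl #s] uses
// s = log2(access size in bytes), so 2- and 16-byte accesses imply the
// shift amounts 1 and 4 that slow-14 cores charge an extra micro-op for.
bool isFoldedAddrShiftCheap(unsigned AccessSize, bool HasAddrLSLSlow14) {
  assert(AccessSize != 0 && (AccessSize & (AccessSize - 1)) == 0 &&
         "expected a power-of-two access size");
  unsigned ShiftAmt = __builtin_ctz(AccessSize); // log2 of the size
  return !(HasAddrLSLSlow14 && (ShiftAmt == 1 || ShiftAmt == 4));
}
```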
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a8f2c45..d4daf17 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6907,10 +6907,8 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
MI.getParent()->getParent()->getFunction().hasOptSize())
return true;
- // It's better to avoid folding and recomputing shifts when we don't have a
- // fastpath.
- if (!STI.hasAddrLSLFast())
- return false;
+ // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
+ // appropriate.
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 33dba6a5..043f142 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1141,9 +1141,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarize(1)
.lower();
- getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
- .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
-
getActionDefinitionsBuilder({G_FSHL, G_FSHR})
.customFor({{s32, s32}, {s32, s64}, {s64, s64}})
.lower();
@@ -1191,8 +1188,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.minScalarEltSameAsIf(always, 1, 0)
.maxScalarEltSameAsIf(always, 1, 0);
- // TODO: Vector types.
- getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
+ getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+ .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
+ .clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampMaxNumElements(0, s64, 2)
+ .moreElementsToNextPow2(0)
+ .lower();
// TODO: Libcall support for s128.
// TODO: s16 should be legal with full FP16 support.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index e1d6dd7..dc5383c 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -58,8 +58,17 @@ void AArch64TargetStreamer::finish() {
emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
}
-void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
- if (Flags == 0)
+void AArch64TargetStreamer::emitNoteSection(unsigned Flags,
+ uint64_t PAuthABIPlatform,
+ uint64_t PAuthABIVersion) {
+ assert(((PAuthABIPlatform == uint64_t(-1)) ==
+         (PAuthABIVersion == uint64_t(-1))) &&
+        "PAuthABIPlatform and PAuthABIVersion must be set together");
+ uint64_t DescSz = 0;
+ if (Flags != 0)
+ DescSz += 4 * 4;
+ if (PAuthABIPlatform != uint64_t(-1))
+ DescSz += 4 + 4 + 8 * 2;
+ if (DescSz == 0)
return;
MCStreamer &OutStreamer = getStreamer();
@@ -80,15 +89,25 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
// Emit the note header.
OutStreamer.emitValueToAlignment(Align(8));
OutStreamer.emitIntValue(4, 4); // data size for "GNU\0"
- OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size
+ OutStreamer.emitIntValue(DescSz, 4); // Elf_Prop array size
OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
OutStreamer.emitBytes(StringRef("GNU", 4)); // note name
// Emit the PAC/BTI properties.
- OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
- OutStreamer.emitIntValue(4, 4); // data size
- OutStreamer.emitIntValue(Flags, 4); // data
- OutStreamer.emitIntValue(0, 4); // pad
+ if (Flags != 0) {
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
+ OutStreamer.emitIntValue(4, 4); // data size
+ OutStreamer.emitIntValue(Flags, 4); // data
+ OutStreamer.emitIntValue(0, 4); // pad
+ }
+
+ // Emit the PAuth ABI compatibility info.
+ if (PAuthABIPlatform != uint64_t(-1)) {
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_PAUTH, 4);
+ OutStreamer.emitIntValue(8 * 2, 4); // data size
+ OutStreamer.emitIntValue(PAuthABIPlatform, 8);
+ OutStreamer.emitIntValue(PAuthABIVersion, 8);
+ }
OutStreamer.endSection(Nt);
OutStreamer.switchSection(Cur);
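As a cross-check of the DescSz arithmetic above: each GNU property is a 4-byte type, a 4-byte data size, and a payload padded to 8-byte alignment, so the feature-1-AND entry takes 4+4+4+4(pad) = 16 bytes and the PAuth entry 4+4+8*2 = 24 bytes with no padding needed. A small sketch of the same computation:

```cpp
#include <cstdint>

// Mirrors the descriptor-size computation in emitNoteSection: one
// GNU_PROPERTY_AARCH64_FEATURE_1_AND entry (4-byte flags plus 4-byte pad)
// and/or one GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry (two 8-byte words).
uint64_t notePropertiesSize(bool HasFeatureFlags, bool HasPAuthInfo) {
  uint64_t DescSz = 0;
  if (HasFeatureFlags)
    DescSz += 4 + 4 + 4 + 4; // type + datasz + flags + pad to 8
  if (HasPAuthInfo)
    DescSz += 4 + 4 + 8 * 2; // type + datasz + platform + version
  return DescSz;
}
```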
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 7676d88..e8a9dc4 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -35,7 +35,8 @@ public:
void emitCurrentConstantPool();
/// Callback used to implement the .note.gnu.property section.
- void emitNoteSection(unsigned Flags);
+ void emitNoteSection(unsigned Flags, uint64_t PAuthABIPlatform = -1,
+ uint64_t PAuthABIVersion = -1);
/// Callback used to implement the .inst directive.
virtual void emitInst(uint32_t Inst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 9083150..1114a8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1086,7 +1086,7 @@ void SplitPtrStructs::processConditionals() {
if (MaybeRsrc)
for (Value *V : Seen)
FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ } else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
Rsrc->replaceAllUsesWith(*MaybeRsrc);
@@ -1777,8 +1777,8 @@ void SplitPtrStructs::processFunction(Function &F) {
Originals.push_back(&I);
for (Instruction *I : Originals) {
auto [Rsrc, Off] = visit(I);
- assert((Rsrc && Off) ||
- (!Rsrc && !Off) && "Can't have a resource but no offset");
+ assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
+ "Can't have a resource but no offset");
if (Rsrc)
RsrcParts[I] = Rsrc;
if (Off)
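The assert rewrite above is a precedence fix: && binds tighter than ||, so the old condition parsed as (Rsrc && Off) || ((!Rsrc && !Off) && "..."). A string literal is always non-null, so the checked predicate happened to be unchanged, but the grouping was accidental and draws -Wparentheses; the added parentheses make the message apply to the whole disjunction, as intended. A minimal reproduction of the pattern:

```cpp
#include <cassert>

void demo(bool A, bool B) {
  // Warns under -Wparentheses: parsed as A || (B && "msg"), which only
  // works out because the literal is truthy.
  // assert(A || B && "msg");
  assert((A || B) && "msg"); // explicit grouping, same truth value here
}
```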
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 294fc68..3866723 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4627,10 +4627,15 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
if (Src1Idx >= 0) {
const MCOperand &Src1 = Inst.getOperand(Src1Idx);
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (Src1.isImm() ||
- (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
- Error(Op.getStartLoc(), "invalid operand for instruction");
+ if (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI)) {
+ auto Reg = mc2PseudoReg(Inst.getOperand(Src1Idx).getReg());
+ SMLoc S = getRegLoc(Reg, Operands);
+ Error(S, "invalid operand for instruction");
+ return false;
+ }
+ if (Src1.isImm()) {
+ Error(getInstLoc(Operands),
+ "src1 immediate operand invalid for instruction");
return false;
}
}
diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
index f4f02d2..0541f0f 100644
--- a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td
@@ -112,7 +112,7 @@ class DSDIR_Real<DSDIR_Pseudo lds, dag ins, string asm, int subtarget> :
lds.Mnemonic # asm,
ins,
lds.is_direct>,
- SIMCInstr <lds.Mnemonic, subtarget> {
+ SIMCInstr <lds.PseudoInstr, subtarget> {
let isPseudo = 0;
let isCodeGenOnly = 0;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e944dde..0773ef7 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1192,7 +1192,7 @@ def : GCNPat <
class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
string opName = ps.Mnemonic,
bit hasGDS = true>
- : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
+ : DS_Real<ps, opName>, SIMCInstr <ps.PseudoInstr, ef> {
let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
@@ -1557,7 +1557,7 @@ defm DS_MAX_SRC2_F64 : DS_Real_gfx6_gfx7_gfx10<0x0d3>;
class DS_Real_vi <bits<8> op, DS_Pseudo ps> :
DS_Real <ps>,
- SIMCInstr <ps.Mnemonic, SIEncodingFamily.VI> {
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate = isGFX8GFX9;
let DecoderNamespace = "GFX8";
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d017ec4..27d5616 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2558,7 +2558,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op,
multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = ""> :
+ string alias = name> :
VFLAT_Real_Base_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
}
@@ -2581,7 +2581,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic,
- string alias = ""> :
+ string alias = name> :
VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
defm _RTN : VFLAT_Real_gfx12<op, name>;
defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2762190..bb499c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -708,9 +708,6 @@ public:
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
bool FlushVmCnt);
- bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
OldWaitcntInstr);
}
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
- AMDGPU::Waitcnt Wait;
-
- unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
- unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
- unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
- if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
- return false;
-
- if (LoadCntPending != 0)
- Wait.LoadCnt = 0;
- if (SampleCntPending != 0)
- Wait.SampleCnt = 0;
- if (BvhCntPending != 0)
- Wait.BvhCnt = 0;
-
- return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr);
-}
-
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
@@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
+ // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
+ // needed.
+ AMDGPU::Waitcnt Wait;
if (Block.getFirstTerminator() == Block.end() &&
- isPreheaderToFlush(Block, ScoreBrackets))
- Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+ isPreheaderToFlush(Block, ScoreBrackets)) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
+ }
+
+ // Combine or remove any redundant waitcnts at the end of the block.
+ Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+ OldWaitcntInstr);
return Modified;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 1694436..f1afbcc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2268,7 +2268,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field Operand Src1ModDPP = getSrcModDPP<Src1VT>.ret;
field Operand Src2ModDPP = getSrcModDPP<Src2VT>.ret;
field Operand Src0ModVOP3DPP = getSrcModDPP<Src0VT>.ret;
- field Operand Src1ModVOP3DPP = getSrcModDPP<Src1VT>.ret;
+ field Operand Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT>.ret;
field Operand Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT>.ret;
field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d34ee34..0b7d45e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1972,7 +1972,7 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx11 : SOP1_Real<op, ps, name>,
- Select_gfx11<ps.Mnemonic>;
+ Select_gfx11<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
}
@@ -1980,14 +1980,14 @@ multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> {
multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx12 : SOP1_Real<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
multiclass SOP1_M0_Real_gfx12<bits<8> op> {
def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic> {
+ Select_gfx12<!cast<SOP1_Pseudo>(NAME).PseudoInstr> {
let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0
}
}
@@ -1995,7 +1995,7 @@ multiclass SOP1_M0_Real_gfx12<bits<8> op> {
multiclass SOP1_IMM_Real_gfx12<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx12 : SOP1_Real<op, ps>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
}
multiclass SOP1_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)> :
@@ -2106,7 +2106,7 @@ defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>;
multiclass SOP1_Real_gfx10<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx10 : SOP1_Real<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> :
@@ -2139,7 +2139,7 @@ defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>;
multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
defvar ps = !cast<SOP1_Pseudo>(NAME);
def _gfx6_gfx7 : SOP1_Real<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
@@ -2205,7 +2205,7 @@ defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx12 : SOP2_Real32<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
@@ -2222,7 +2222,7 @@ defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;
multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx11 : SOP2_Real32<op, ps, name>,
- Select_gfx11<ps.Mnemonic>;
+ Select_gfx11<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
}
@@ -2283,12 +2283,12 @@ defm S_MUL_U64 : SOP2_Real_gfx12<0x055>;
multiclass SOP2_Real_FMAK_gfx12<bits<7> op> {
def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+ Select_gfx12<!cast<SOP2_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOP2_Real_FMAK_gfx11<bits<7> op> {
def _gfx11 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOP2_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> :
@@ -2325,7 +2325,7 @@ defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04c, "s_max_num_f16">;
multiclass SOP2_Real_gfx10<bits<7> op> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx10 : SOP2_Real32<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> :
@@ -2348,7 +2348,7 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
def _gfx6_gfx7 : SOP2_Real32<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
@@ -2410,24 +2410,24 @@ defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx12 : SOPK_Real32<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
multiclass SOPK_Real32_gfx11<bits<5> op> {
def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPK_Real64_gfx12<bits<5> op> {
def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ Select_gfx12<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPK_Real64_gfx11<bits<5> op> {
def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOPK_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> :
@@ -2454,13 +2454,13 @@ defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
multiclass SOPK_Real32_gfx10<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx10 : SOPK_Real32<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOPK_Real64_gfx10<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx10 : SOPK_Real64<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> :
@@ -2485,13 +2485,13 @@ defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>;
multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx6_gfx7 : SOPK_Real32<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> {
defvar ps = !cast<SOPK_Pseudo>(NAME);
def _gfx6_gfx7 : SOPK_Real64<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> :
@@ -2539,7 +2539,7 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx12 : SOPP_Real_32<op, ps, name>,
- Select_gfx12<ps.Mnemonic>;
+ Select_gfx12<ps.PseudoInstr>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
}
@@ -2564,7 +2564,7 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx11 : SOPP_Real_32<op, ps, name>,
- Select_gfx11<ps.Mnemonic>,
+ Select_gfx11<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_gfx11">;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
@@ -2572,13 +2572,13 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> {
multiclass SOPP_Real_64_gfx12<bits<7> op> {
def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx12<!cast<SOPP_Pseudo>(NAME).PseudoInstr>,
SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
}
multiclass SOPP_Real_64_gfx11<bits<7> op> {
def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx11<!cast<SOPP_Pseudo>(NAME).PseudoInstr>,
SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
}
@@ -2654,21 +2654,21 @@ defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>;
multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>,
- Select_gfx6_gfx7<ps.Mnemonic>,
+ Select_gfx6_gfx7<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _vi : SOPP_Real_32<op, ps>,
- Select_vi<ps.Mnemonic>,
+ Select_vi<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_32_gfx10<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx10 : SOPP_Real_32<op, ps>,
- Select_gfx10<ps.Mnemonic>,
+ Select_gfx10<ps.PseudoInstr>,
SOPPRelaxTable<0, ps.KeyName, "_gfx10">;
}
@@ -2691,21 +2691,21 @@ multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> :
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx6_gfx7 : SOPP_Real_64<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>,
+ Select_gfx6_gfx7<ps.PseudoInstr>,
SOPPRelaxTable<1, ps.KeyName, "_gfx6_gfx7">;
}
multiclass SOPP_Real_64_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _vi : SOPP_Real_64<op, ps>,
- Select_vi<ps.Mnemonic>,
+ Select_vi<ps.PseudoInstr>,
SOPPRelaxTable<1, ps.KeyName, "_vi">;
}
multiclass SOPP_Real_64_gfx10<bits<7> op> {
defvar ps = !cast<SOPP_Pseudo>(NAME);
def _gfx10 : SOPP_Real_64<op, ps>,
- Select_gfx10<ps.Mnemonic>,
+ Select_gfx10<ps.PseudoInstr>,
SOPPRelaxTable<1, ps.KeyName, "_gfx10">;
}
@@ -2771,12 +2771,12 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
multiclass SOPC_Real_gfx12<bits<7> op> {
def _gfx12 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx12<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ Select_gfx12<!cast<SOPC_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPC_Real_gfx11<bits<7> op> {
def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
- Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+ Select_gfx11<!cast<SOPC_Pseudo>(NAME).PseudoInstr>;
}
multiclass SOPC_Real_gfx11_gfx12<bits<7> op> :
@@ -2826,19 +2826,19 @@ defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>;
multiclass SOPC_Real_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
def _gfx6_gfx7 : SOPC_Real<op, ps>,
- Select_gfx6_gfx7<ps.Mnemonic>;
+ Select_gfx6_gfx7<ps.PseudoInstr>;
}
multiclass SOPC_Real_gfx8_gfx9<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
def _vi : SOPC_Real<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
}
multiclass SOPC_Real_gfx10<bits<7> op> {
defvar ps = !cast<SOPC_Pseudo>(NAME);
def _gfx10 : SOPC_Real<op, ps>,
- Select_gfx10<ps.Mnemonic>;
+ Select_gfx10<ps.PseudoInstr>;
}
multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> :
@@ -2878,15 +2878,15 @@ defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>;
class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
SOP1_Real<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
SOP2_Real32<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
SOPK_Real32<op, ps>,
- Select_vi<ps.Mnemonic>;
+ Select_vi<ps.PseudoInstr>;
def S_MOV_B32_vi : SOP1_Real_vi <0x00, S_MOV_B32>;
def S_MOV_B64_vi : SOP1_Real_vi <0x01, S_MOV_B64>;
@@ -3007,7 +3007,7 @@ def S_GETREG_B32_vi : SOPK_Real_vi <0x11, S_GETREG_B32>;
def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>;
//def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
- Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
+ Select_vi<S_SETREG_IMM32_B32.PseudoInstr>;
def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5d44396..4e00744 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -182,6 +182,8 @@ unsigned getAMDHSACodeObjectVersion(unsigned ABIVersion) {
return 4;
case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
return 5;
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V6:
+ return 6;
default:
return getDefaultAMDHSACodeObjectVersion();
}
@@ -496,9 +498,7 @@ bool isVOPC64DPP(unsigned Opc) {
return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc);
}
-bool isVOPCAsmOnly(unsigned Opc) {
- return isVOPCAsmOnlyOpcodeHelper(Opc) || isVOP3CAsmOnlyOpcodeHelper(Opc);
-}
+bool isVOPCAsmOnly(unsigned Opc) { return isVOPCAsmOnlyOpcodeHelper(Opc); }
bool getMAIIsDGEMM(unsigned Opc) {
const MAIInstInfo *Info = getMAIInstInfoHelper(Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index f136a43..c001c5d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -503,6 +503,7 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
dpp8:$dpp8, Dpp8FI:$fi);
let Src2Mod = FP32InputMods; // dummy unused modifiers
let Src2RC64 = VGPRSrc_32; // stub argument
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret;
}
def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0, HasExt32BitDPP = 0 in
@@ -618,7 +619,7 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
let AsmVOP3Base = "$vdst, $src0_modifiers, $src1_modifiers, $src2";
let Outs32 = (outs DstRC:$vdst);
- let Outs64 = (outs DstRC:$vdst);
+ let Outs64 = (outs DstRC64:$vdst);
// Suppress src2 implied by type since the 32-bit encoding uses an
// implicit VCC use.
@@ -652,7 +653,7 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
dpp8:$dpp8, Dpp8FI:$fi);
let Src0ModVOP3DPP = FPVRegInputMods;
- let Src1ModVOP3DPP = FPVRegInputMods;
+ let Src1ModVOP3DPP = FP32VCSrcInputMods;
let HasExt = 1;
let HasExtDPP = 1;
@@ -662,7 +663,17 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
}
def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
-def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
+def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
+ let IsTrue16 = 1;
+ let DstRC64 = getVALUDstForVT<DstVT>.ret;
+
+ let Src0Mod = getSrcMod<f16>.ret;
+ let Src1Mod = getSrcMod<f16>.ret;
+
+ let Src0VOP3DPP = VGPRSrc_32;
+ let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 1/*IsFake16*/>.ret;
+}
def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
let Outs32 = (outs SReg_32:$vdst);
@@ -703,7 +714,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
//===----------------------------------------------------------------------===//
let SubtargetPredicate = isGFX11Plus in
-defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1>;
+defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>;
defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">;
let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 022fb7c..0b3a3d5 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -772,7 +772,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VRegSrc_32:$src1);
+ dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VCSrc_b32:$src1);
let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
(ins)));
let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
@@ -1377,31 +1377,9 @@ multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
- defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
- let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
- defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
- def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
- let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
}
} // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
@@ -1472,35 +1450,9 @@ multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
- defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
- def _e64_dpp_w32#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp_w64#Gen.Suffix
- : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
- defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
- def _e64_dpp8_w32#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- def _e64_dpp8_w64#Gen.Suffix
- : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
- let AsmString = asm_name # " vcc, " # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
}
} // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace
}
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a6272e9..60e91c7 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1680,7 +1680,6 @@ class AsmOnlyInfoTable <string Format, string Class>: GenericTable {
}
def VOPCAsmOnlyInfoTable : AsmOnlyInfoTable <"VOPC", "VOPC_DPPe_Common">;
-def VOP3CAsmOnlyInfoTable : AsmOnlyInfoTable <"VOP3C", "VOP3_DPPe_Common_Base">;
def VOPTrue16Table : GenericTable {
let FilterClass = "VOP_Pseudo";
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 9c29acb..bef7607 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -153,15 +153,15 @@ class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
class LWPC_ENC : PCREL19_FM<OPCODE2_LWPC>;
-class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
-class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MAX_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
-class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
-class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
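For reference, our recollection of the COP1 3R function-field table in the MIPS32r6 manual (worth verifying against the spec) is min=0b011100, mina=0b011101, max=0b011110, maxa=0b011111; the old definitions had MAX and MINA swapped, and the patch restores the table order:

```cpp
// MIPS32r6 COP1 3R function fields as we recall them (verify against the
// manual); MIN/MINA/MAX/MAXA occupy four consecutive encodings.
enum Mips32r6FpMinMaxFunc : unsigned {
  MinFmt  = 0b011100, // min.fmt
  MinaFmt = 0b011101, // mina.fmt (was mis-encoded as 0b011110)
  MaxFmt  = 0b011110, // max.fmt  (was mis-encoded as 0b011101)
  MaxaFmt = 0b011111, // maxa.fmt
};
```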
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 7f35107..38c1f9868 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -139,20 +139,21 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.clampScalar(0, s32, sXLen)
.minScalarSameAs(1, 0);
+ auto &ExtActions =
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+ .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
if (ST.is64Bit()) {
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
- .legalFor({{sXLen, s32}})
- .maxScalar(0, sXLen);
-
+ ExtActions.legalFor({{sXLen, s32}});
getActionDefinitionsBuilder(G_SEXT_INREG)
.customFor({sXLen})
.maxScalar(0, sXLen)
.lower();
} else {
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}).maxScalar(0, sXLen);
-
getActionDefinitionsBuilder(G_SEXT_INREG).maxScalar(0, sXLen).lower();
}
+ ExtActions.customIf(typeIsLegalBoolVec(1, BoolVecTys, ST))
+ .maxScalar(0, sXLen);
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
@@ -235,7 +236,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder(G_ICMP)
.legalFor({{sXLen, sXLen}, {sXLen, p0}})
- .widenScalarToNextPow2(1)
+ .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
+ typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)))
+ .widenScalarOrEltToNextPow2OrMinSize(1, 8)
.clampScalar(1, sXLen, sXLen)
.clampScalar(0, sXLen, sXLen);
@@ -418,6 +421,29 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.clampScalar(0, sXLen, sXLen)
.customFor({sXLen});
+ auto &SplatActions =
+ getActionDefinitionsBuilder(G_SPLAT_VECTOR)
+ .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIs(1, sXLen)))
+ .customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), typeIs(1, s1)));
+ // Handle case of s64 element vectors on RV32. If the subtarget does not have
+ // f64, then try to lower it to G_SPLAT_VECTOR_SPLIT_I64_VL. If the subtarget
+ // does have f64, then we don't know whether the type is an f64 or an i64,
+ // so mark the G_SPLAT_VECTOR as legal and decide later what to do with it,
+ // depending on how the instructions that consume it are legalized. They are
+ // not legalized yet, since legalization proceeds in reverse postorder, so we
+ // cannot make the decision at this point.
+ if (XLen == 32) {
+ if (ST.hasVInstructionsF64() && ST.hasStdExtD())
+ SplatActions.legalIf(all(
+ typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
+ else if (ST.hasVInstructionsI64())
+ SplatActions.customIf(all(
+ typeInSet(0, {nxv1s64, nxv2s64, nxv4s64, nxv8s64}), typeIs(1, s64)));
+ }
+
+ SplatActions.clampScalar(1, sXLen, sXLen);
+
getLegacyLegalizerInfo().computeTables();
}
@@ -576,7 +602,145 @@ bool RISCVLegalizerInfo::legalizeVScale(MachineInstr &MI,
auto VScale = MIB.buildLShr(XLenTy, VLENB, MIB.buildConstant(XLenTy, 3));
MIB.buildMul(Dst, VScale, MIB.buildConstant(XLenTy, Val));
}
+ MI.eraseFromParent();
+ return true;
+}
+
+// Custom-lower extensions from mask vectors by using a vselect either with 1
+// for zero/any-extension or -1 for sign-extension:
+// (vXiN = (s|z)ext vXi1:vmask) -> (vXiN = vselect vmask, (-1 or 1), 0)
+// Note that any-extension is lowered identically to zero-extension.
+bool RISCVLegalizerInfo::legalizeExt(MachineInstr &MI,
+ MachineIRBuilder &MIB) const {
+
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_SEXT ||
+ Opc == TargetOpcode::G_ANYEXT);
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ LLT DstTy = MRI.getType(Dst);
+ int64_t ExtTrueVal = Opc == TargetOpcode::G_SEXT ? -1 : 1;
+ LLT DstEltTy = DstTy.getElementType();
+ auto SplatZero = MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, 0));
+ auto SplatTrue =
+ MIB.buildSplatVector(DstTy, MIB.buildConstant(DstEltTy, ExtTrueVal));
+ MIB.buildSelect(Dst, Src, SplatTrue, SplatZero);
+
+ MI.eraseFromParent();
+ return true;
+}
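Modeled per-lane, the lowering above is just a select. A scalar sketch of what each destination lane computes (assumed names, not GISel API):

```cpp
#include <cstdio>

// Per-lane model of legalizeExt: sign-extension selects -1 for set mask
// lanes, zero/any-extension selects 1; cleared lanes always produce 0.
int extendMaskLane(bool MaskLane, bool IsSExt) {
  int ExtTrueVal = IsSExt ? -1 : 1;
  return MaskLane ? ExtTrueVal : 0;
}

int main() {
  printf("%d %d %d\n", extendMaskLane(true, /*IsSExt=*/true),
         extendMaskLane(true, false), extendMaskLane(false, true)); // -1 1 0
}
```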
+
+/// Return the mask type suitable for masking the provided vector type: an
+/// i1-element vector of the same (possibly scalable) length.
+static LLT getMaskTypeFor(LLT VecTy) {
+ assert(VecTy.isVector());
+ ElementCount EC = VecTy.getElementCount();
+ return LLT::vector(EC, LLT::scalar(1));
+}
+
+/// Creates an all ones mask suitable for masking a vector of type VecTy with
+/// vector length VL.
+static MachineInstrBuilder buildAllOnesMask(LLT VecTy, const SrcOp &VL,
+ MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ LLT MaskTy = getMaskTypeFor(VecTy);
+ return MIB.buildInstr(RISCV::G_VMSET_VL, {MaskTy}, {VL});
+}
+
+/// Gets the two common "VL" operands: an all-ones mask and the vector length.
+/// VecTy is a scalable vector type.
+static std::pair<MachineInstrBuilder, Register>
+buildDefaultVLOps(const DstOp &Dst, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ LLT VecTy = Dst.getLLTTy(MRI);
+ assert(VecTy.isScalableVector() && "Expecting scalable container type");
+ Register VL(RISCV::X0);
+ MachineInstrBuilder Mask = buildAllOnesMask(VecTy, VL, MIB, MRI);
+ return {Mask, VL};
+}
+
+static MachineInstrBuilder
+buildSplatPartsS64WithVL(const DstOp &Dst, const SrcOp &Passthru, Register Lo,
+ Register Hi, Register VL, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ // TODO: If the Hi bits of the splat are undefined, then it's fine to just
+ // splat Lo even if it might be sign extended. We don't think we have
+ // introduced a case yet where we build an s64 whose upper bits are undef.
+
+ // Fall back to a stack store and stride x0 vector load.
+ // TODO: G_SPLAT_VECTOR_SPLIT_I64_VL still needs to be lowered; SelectionDAG
+ // does this in PreprocessISelDAG.
+ return MIB.buildInstr(RISCV::G_SPLAT_VECTOR_SPLIT_I64_VL, {Dst},
+ {Passthru, Lo, Hi, VL});
+}
+
+static MachineInstrBuilder
+buildSplatSplitS64WithVL(const DstOp &Dst, const SrcOp &Passthru,
+ const SrcOp &Scalar, Register VL,
+ MachineIRBuilder &MIB, MachineRegisterInfo &MRI) {
+ assert(Scalar.getLLTTy(MRI) == LLT::scalar(64) && "Unexpected scalar type!");
+ auto Unmerge = MIB.buildUnmerge(LLT::scalar(32), Scalar);
+ return buildSplatPartsS64WithVL(Dst, Passthru, Unmerge.getReg(0),
+ Unmerge.getReg(1), VL, MIB, MRI);
+}
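
A hedged plain-C++ model of the unmerge step above: G_UNMERGE_VALUES on an
s64 yields the low s32 half first, then the high half, which is the Lo/Hi
order buildSplatPartsS64WithVL expects.

    #include <cstdint>
    void unmergeS64(uint64_t Scalar, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(Scalar);       // Unmerge.getReg(0)
      Hi = static_cast<uint32_t>(Scalar >> 32); // Unmerge.getReg(1)
    }
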
+
+// Lower splats of s1 types to G_ICMP. For each mask vector type, we have an
+// equivalently-sized legal i8 vector type that we can use as a go-between.
+// Splats of s1 types with a constant value can be legalized directly to
+// VMSET_VL or VMCLR_VL.
+bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
+ MachineIRBuilder &MIB) const {
+ assert(MI.getOpcode() == TargetOpcode::G_SPLAT_VECTOR);
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register SplatVal = MI.getOperand(1).getReg();
+
+ LLT VecTy = MRI.getType(Dst);
+ LLT XLenTy(STI.getXLenVT());
+
+ // Handle the case of s64 element vectors on RV32.
+ if (XLenTy.getSizeInBits() == 32 &&
+ VecTy.getElementType().getSizeInBits() == 64) {
+ auto [_, VL] = buildDefaultVLOps(Dst, MIB, MRI);
+ buildSplatSplitS64WithVL(Dst, MIB.buildUndef(VecTy), SplatVal, VL, MIB,
+ MRI);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // All-zeros or all-ones splats are handled specially.
+ MachineInstr &SplatValMI = *MRI.getVRegDef(SplatVal);
+ if (isAllOnesOrAllOnesSplat(SplatValMI, MRI)) {
+ auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
+ MIB.buildInstr(RISCV::G_VMSET_VL, {Dst}, {VL});
+ MI.eraseFromParent();
+ return true;
+ }
+ if (isNullOrNullSplat(SplatValMI, MRI)) {
+ auto VL = buildDefaultVLOps(VecTy, MIB, MRI).second;
+ MIB.buildInstr(RISCV::G_VMCLR_VL, {Dst}, {VL});
+ MI.eraseFromParent();
+ return true;
+ }
+ // Handle a non-constant mask splat (i.e. one not known to be all zeros or
+ // all ones) by promoting it to an s8 splat.
+ LLT InterEltTy = LLT::scalar(8);
+ LLT InterTy = VecTy.changeElementType(InterEltTy);
+ auto ZExtSplatVal = MIB.buildZExt(InterEltTy, SplatVal);
+ auto And =
+ MIB.buildAnd(InterEltTy, ZExtSplatVal, MIB.buildConstant(InterEltTy, 1));
+ auto LHS = MIB.buildSplatVector(InterTy, And);
+ auto ZeroSplat =
+ MIB.buildSplatVector(InterTy, MIB.buildConstant(InterEltTy, 0));
+ MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, LHS, ZeroSplat);
MI.eraseFromParent();
return true;
}
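
A hedged per-lane model of the non-constant s1-splat promotion above: the
splat value is zero-extended to s8, masked to its low bit, splatted, and
compared not-equal against a zero splat to reconstitute the mask.

    #include <cstdint>
    bool splatMaskLane(uint8_t SplatVal) {
      uint8_t Lane = SplatVal & 1; // G_ZEXT to s8, then G_AND with 1
      return Lane != 0;            // G_ICMP ne against the zero splat
    }
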
@@ -640,6 +804,12 @@ bool RISCVLegalizerInfo::legalizeCustom(
return legalizeVAStart(MI, MIRBuilder);
case TargetOpcode::G_VSCALE:
return legalizeVScale(MI, MIRBuilder);
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ANYEXT:
+ return legalizeExt(MI, MIRBuilder);
+ case TargetOpcode::G_SPLAT_VECTOR:
+ return legalizeSplatVector(MI, MIRBuilder);
}
llvm_unreachable("expected switch to return");
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index e2a98c8..5bb1e7a 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -43,6 +43,8 @@ private:
bool legalizeVAStart(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
+ bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
+ bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
};
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index 888bcc4..86e4434 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -290,16 +290,7 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_ADD:
- case TargetOpcode::G_SUB: {
- if (MRI.getType(MI.getOperand(0).getReg()).isVector()) {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- return getInstructionMapping(
- DefaultMappingID, /*Cost=*/1,
- getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue()),
- NumOperands);
- }
- }
- LLVM_FALLTHROUGH;
+ case TargetOpcode::G_SUB:
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
@@ -320,14 +311,6 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_PTR_ADD:
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_INTTOPTR:
- case TargetOpcode::G_TRUNC:
- case TargetOpcode::G_ANYEXT:
- case TargetOpcode::G_SEXT:
- case TargetOpcode::G_ZEXT:
- case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD:
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping,
- NumOperands);
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
@@ -338,25 +321,48 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1,
- getFPValueMapping(Ty.getSizeInBits()),
- NumOperands);
+ TypeSize Size = Ty.getSizeInBits();
+
+ const ValueMapping *Mapping;
+ if (Ty.isVector())
+ Mapping = getVRBValueMapping(Size.getKnownMinValue());
+ else if (isPreISelGenericFloatingPointOpcode(Opc))
+ Mapping = getFPValueMapping(Size.getFixedValue());
+ else
+ Mapping = GPRValueMapping;
+
+#ifndef NDEBUG
+ // Make sure all the operands use compatible sizes and types.
+ for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
+ LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
+ assert(Ty.isVector() == OpTy.isVector() &&
+ "Operand has incompatible type");
+ // Don't check size for GPR.
+ if (OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
+ assert(Size == OpTy.getSizeInBits() && "Operand has incompatible size");
+ }
+#endif // End NDEBUG
+
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
}
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping,
+ NumOperands);
case TargetOpcode::G_IMPLICIT_DEF: {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- uint64_t DstMinSize = DstTy.getSizeInBits().getKnownMinValue();
+ unsigned DstMinSize = DstTy.getSizeInBits().getKnownMinValue();
auto Mapping = GPRValueMapping;
// FIXME: May need to do a better job determining when to use FPRB.
// For example, the look through COPY case:
// %0:_(s32) = G_IMPLICIT_DEF
// %1:_(s32) = COPY %0
// $f10_d = COPY %1(s32)
- if (anyUseOnlyUseFP(Dst, MRI, TRI))
- Mapping = getFPValueMapping(DstMinSize);
-
if (DstTy.isVector())
Mapping = getVRBValueMapping(DstMinSize);
+ else if (anyUseOnlyUseFP(Dst, MRI, TRI))
+ Mapping = getFPValueMapping(DstMinSize);
return getInstructionMapping(DefaultMappingID, /*Cost=*/1, Mapping,
NumOperands);
@@ -529,7 +535,10 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (!Ty.isValid())
continue;
- if (isPreISelGenericFloatingPointOpcode(Opc))
+ if (Ty.isVector())
+ OpdsMapping[Idx] =
+ getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue());
+ else if (isPreISelGenericFloatingPointOpcode(Opc))
OpdsMapping[Idx] = getFPValueMapping(Ty.getSizeInBits());
else
OpdsMapping[Idx] = GPRValueMapping;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 55ba494..f99dc0b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3287,24 +3287,24 @@ bool RISCVDAGToDAGISel::selectVSplatUimm(SDValue N, unsigned Bits,
}
bool RISCVDAGToDAGISel::selectLow8BitsVSplat(SDValue N, SDValue &SplatVal) {
- // Truncates are custom lowered during legalization.
- auto IsTrunc = [this](SDValue N) {
- if (N->getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
+ auto IsExtOrTrunc = [](SDValue N) {
+ switch (N->getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ // There's no passthru on these _VL nodes so any VL/mask is ok, since any
+ // inactive elements will be undef.
+ case RISCVISD::TRUNCATE_VECTOR_VL:
+ case RISCVISD::VSEXT_VL:
+ case RISCVISD::VZEXT_VL:
+ return true;
+ default:
return false;
- SDValue VL;
- selectVLOp(N->getOperand(2), VL);
- // Any vmset_vl is ok, since any bits past VL are undefined and we can
- // assume they are set.
- return N->getOperand(1).getOpcode() == RISCVISD::VMSET_VL &&
- isa<ConstantSDNode>(VL) &&
- cast<ConstantSDNode>(VL)->getSExtValue() == RISCV::VLMaxSentinel;
+ }
};
- // We can have multiple nested truncates, so unravel them all if needed.
- while (N->getOpcode() == ISD::SIGN_EXTEND ||
- N->getOpcode() == ISD::ZERO_EXTEND || IsTrunc(N)) {
- if (!N.hasOneUse() ||
- N.getValueType().getSizeInBits().getKnownMinValue() < 8)
+ // We can have multiple nested ext/trunc nodes, so unravel them all if needed.
+ while (IsExtOrTrunc(N)) {
+ if (!N.hasOneUse() || N.getScalarValueSizeInBits() < 8)
return false;
N = N->getOperand(0);
}
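
Why looking through these nodes is sound (a hedged scalar illustration):
sext, zext, and trunc all preserve the low byte of a value as long as the
element width never drops below 8 bits, so the original low-8-bit splat
value survives the whole chain.

    #include <cstdint>
    bool lowBytePreserved(uint32_t X) {
      int64_t SExt = static_cast<int64_t>(static_cast<int32_t>(X)); // sext
      uint64_t ZExt = X;                                            // zext
      uint16_t Trunc = static_cast<uint16_t>(X);                    // trunc
      return static_cast<uint8_t>(SExt) == static_cast<uint8_t>(X) &&
             static_cast<uint8_t>(ZExt) == static_cast<uint8_t>(X) &&
             static_cast<uint8_t>(Trunc) == static_cast<uint8_t>(X);
    }
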
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ee83f9d..279d8a4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21115,12 +21115,10 @@ void RVVArgDispatcher::constructArgInfos(ArrayRef<Type *> TypeList) {
RegisterVT.getVectorElementType() == MVT::i1) {
RVVArgInfos.push_back({1, RegisterVT, true});
FirstVMaskAssigned = true;
- } else {
- RVVArgInfos.push_back({1, RegisterVT, false});
+ --NumRegs;
}
- RVVArgInfos.insert(RVVArgInfos.end(), --NumRegs,
- {1, RegisterVT, false});
+ RVVArgInfos.insert(RVVArgInfos.end(), NumRegs, {1, RegisterVT, false});
}
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
index 54e22d6..ba40662 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
@@ -32,3 +32,28 @@ def G_READ_VLENB : RISCVGenericInstruction {
let hasSideEffects = false;
}
def : GINodeEquiv<G_READ_VLENB, riscv_read_vlenb>;
+
+// Pseudo equivalent to a RISCVISD::VMCLR_VL
+def G_VMCLR_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$vl);
+ let hasSideEffects = false;
+}
+def : GINodeEquiv<G_VMCLR_VL, riscv_vmclr_vl>;
+
+// Pseudo equivalent to a RISCVISD::VMSET_VL
+def G_VMSET_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$vl);
+ let hasSideEffects = false;
+}
+def : GINodeEquiv<G_VMSET_VL, riscv_vmset_vl>;
+
+// Pseudo equivalent to a RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL. There is no
+// GINodeEquiv record marking it as equivalent because it gets lowered away
+// before instruction selection.
+def G_SPLAT_VECTOR_SPLIT_I64_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$passthru, type1:$hi, type1:$lo, type2:$vl);
+ let hasSideEffects = false;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index cc44092..73d52d5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -387,6 +387,9 @@ def SDT_RISCVVEXTEND_VL : SDTypeProfile<1, 3, [SDTCisVec<0>,
SDTCisVT<3, XLenVT>]>;
def riscv_sext_vl : SDNode<"RISCVISD::VSEXT_VL", SDT_RISCVVEXTEND_VL>;
def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>;
+def riscv_ext_vl : PatFrags<(ops node:$A, node:$B, node:$C),
+ [(riscv_sext_vl node:$A, node:$B, node:$C),
+ (riscv_zext_vl node:$A, node:$B, node:$C)]>;
def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
SDTypeProfile<1, 3, [SDTCisVec<0>,
@@ -535,6 +538,11 @@ def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
return N->hasOneUse();
}]>;
+def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
+ (riscv_ext_vl node:$A, node:$B, node:$C), [{
+ return N->hasOneUse();
+}]>;
+
def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
(riscv_fpextend_vl node:$A, node:$B, node:$C), [{
return N->hasOneUse();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 51a7a0a1..c1facc79 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -630,6 +630,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (riscv_ext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
(wti.Vector wti.RegClass:$merge),
@@ -639,6 +652,17 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
(wti.Vector wti.RegClass:$merge),
@@ -647,6 +671,17 @@ foreach vtiToWti = AllWidenableIntVectors in {
wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ba108912..85f8f5f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -254,6 +254,7 @@ public:
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+ bool isTargetAndroid() const { return getTargetTriple().isAndroid(); }
bool isTargetFuchsia() const { return getTargetTriple().isOSFuchsia(); }
bool useConstantPoolForLargeInts() const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 38304ff..aeec063 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -245,6 +245,10 @@ RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
}
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+ return ST->hasVInstructions();
+}
+
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
@@ -861,9 +865,14 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
// TODO: add more intrinsics
case Intrinsic::experimental_stepvector: {
- unsigned Cost = 1; // vid
auto LT = getTypeLegalizationCost(RetTy);
- return Cost + (LT.first - 1);
+ // Legalization of illegal types involves a vid.v instruction plus
+ // (LT.first - 1) vector adds.
+ if (ST->hasVInstructions())
+ return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
+ (LT.first - 1) *
+ getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
+ return 1 + (LT.first - 1);
}
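
A hedged worked example of the new cost formula (unit instruction costs
assumed): if the return type legalizes into four registers, LT.first == 4,
so the cost is 1 (vid.v) + 3 * 1 (vadd.vx) = 4, which matches the scalar
fallback 1 + (LT.first - 1).

    constexpr int VidCost = 1, VAddCost = 1, LTFirst = 4;
    constexpr int Cost = VidCost + (LTFirst - 1) * VAddCost; // == 4
    static_assert(Cost == 1 + (LTFirst - 1), "matches the fallback formula");
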
case Intrinsic::vp_rint: {
// RISC-V target uses at least 5 instructions to lower rounding intrinsics.
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index ac32aea..c0169ea 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -78,6 +78,22 @@ public:
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
+ /// \name EVL Support for predicated vectorization.
+ /// Whether the target supports the %evl parameter of VP intrinsics efficiently
+ /// in hardware, for the given opcode and type/alignment. (see LLVM Language
+ /// Reference - "Vector Predication Intrinsics",
+ /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics and
+ /// "IR-level VP intrinsics",
+ /// https://llvm.org/docs/Proposals/VectorPredication.html#ir-level-vp-intrinsics).
+ /// \param Opcode the opcode of the instruction checked for predicated version
+ /// support.
+ /// \param DataType the type of the instruction with the \p Opcode checked for
+ /// predication support.
+ /// \param Alignment the alignment of the memory access operation checked for
+ /// predicated version support.
+ bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+ Align Alignment) const;
+
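A hedged usage sketch of this hook (caller, variable names, and the helper
are hypothetical): a vectorizer-style client would query it before deciding
to emit VP intrinsics that carry an %evl argument.

    // Assumes TTI, WidestVecTy, and the callee are in scope.
    if (TTI.hasActiveVectorLength(Instruction::Load, WidestVecTy, Align(16)))
      planWithVPIntrinsicsAndEVL(); // hypothetical helper
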
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 1674cef..9e4ba21 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -243,8 +243,12 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
continue;
MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1));
- SPIRVType *ElementType = GR->getOrCreateSPIRVType(
- cast<ConstantAsMetadata>(VMD->getMetadata())->getType(), MIRBuilder);
+ Type *ElementTy = cast<ConstantAsMetadata>(VMD->getMetadata())->getType();
+ if (isUntypedPointerTy(ElementTy))
+ ElementTy =
+ TypedPointerType::get(IntegerType::getInt8Ty(II->getContext()),
+ getPointerAddressSpace(ElementTy));
+ SPIRVType *ElementType = GR->getOrCreateSPIRVType(ElementTy, MIRBuilder);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
addressSpaceToStorageClass(
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index e0099e5..ac79937 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -47,7 +47,7 @@ class SPIRVGlobalRegistry {
DenseMap<const MachineOperand *, const Function *> InstrToFunction;
// Maps Functions to their calls (in the form of machine instructions,
// OpFunctionCall) made before the definition is available
- DenseMap<const Function *, SmallVector<MachineInstr *>> ForwardCalls;
+ DenseMap<const Function *, SmallPtrSet<MachineInstr *, 8>> ForwardCalls;
// Look for an equivalent of the newType in the map. Return the equivalent
// if it's found, otherwise insert newType to the map and return the type.
@@ -215,12 +215,12 @@ public:
if (It == ForwardCalls.end())
ForwardCalls[F] = {MI};
else
- It->second.push_back(MI);
+ It->second.insert(MI);
}
// Map a Function to the set of machine instructions that represent
// forward function calls, or to nullptr if not found.
- SmallVector<MachineInstr *> *getForwardCalls(const Function *F) {
+ SmallPtrSet<MachineInstr *, 8> *getForwardCalls(const Function *F) {
auto It = ForwardCalls.find(F);
return It == ForwardCalls.end() ? nullptr : &It->second;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 90a3155..d450078 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -193,7 +193,7 @@ void validateForwardCalls(const SPIRVSubtarget &STI,
MachineRegisterInfo *DefMRI, SPIRVGlobalRegistry &GR,
MachineInstr &FunDef) {
const Function *F = GR.getFunctionByDefinition(&FunDef);
- if (SmallVector<MachineInstr *> *FwdCalls = GR.getForwardCalls(F))
+ if (SmallPtrSet<MachineInstr *, 8> *FwdCalls = GR.getForwardCalls(F))
for (MachineInstr *FunCall : *FwdCalls) {
MachineRegisterInfo *CallMRI =
&FunCall->getParent()->getParent()->getRegInfo();
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index f4525e71..49749b5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1825,7 +1825,24 @@ bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg,
bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
- return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
+ // Reorder instructions if needed: all OpVariable instructions in a
+ // function must be the first instructions in the first block.
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineBasicBlock *MBB = &MF->front();
+ auto It = MBB->SkipPHIsAndLabels(MBB->begin()), E = MBB->end();
+ bool IsHeader = false;
+ unsigned Opcode;
+ for (; It != E && It != I; ++It) {
+ Opcode = It->getOpcode();
+ if (Opcode == SPIRV::OpFunction || Opcode == SPIRV::OpFunctionParameter) {
+ IsHeader = true;
+ } else if (IsHeader &&
+ !(Opcode == SPIRV::ASSIGN_TYPE || Opcode == SPIRV::OpLabel)) {
+ ++It;
+ break;
+ }
+ }
+ return BuildMI(*MBB, It, It->getDebugLoc(), TII.get(SPIRV::OpVariable))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function))
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 215a8ea..6855471 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -434,6 +434,50 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
default:
// See if this is a generic print operand
return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+ case 'L': // Low order register of a twin word register operand
+ case 'H': // High order register of a twin word register operand
+ {
+ const SparcSubtarget &Subtarget = MF->getSubtarget<SparcSubtarget>();
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ const SparcRegisterInfo *RegisterInfo = Subtarget.getRegisterInfo();
+ Register MOReg = MO.getReg();
+
+ Register HiReg, LoReg;
+ if (!SP::IntPairRegClass.contains(MOReg)) {
+ // If we aren't given a register pair already, find out which pair it
+ // belongs to. Note that the specified register operand, which refers to
+ // the high part of the twin word, must be an even-numbered register.
+ MOReg = RegisterInfo->getMatchingSuperReg(MOReg, SP::sub_even,
+ &SP::IntPairRegClass);
+ if (!MOReg) {
+ SMLoc Loc;
+ OutContext.reportError(
+ Loc, "Hi part of pair should point to an even-numbered register");
+ OutContext.reportError(
+ Loc, "(note that in some cases it might be necessary to manually "
+ "bind the input/output registers instead of relying on "
+ "automatic allocation)");
+ return true;
+ }
+ }
+
+ HiReg = RegisterInfo->getSubReg(MOReg, SP::sub_even);
+ LoReg = RegisterInfo->getSubReg(MOReg, SP::sub_odd);
+
+ Register Reg;
+ switch (ExtraCode[0]) {
+ case 'L':
+ Reg = LoReg;
+ break;
+ case 'H':
+ Reg = HiReg;
+ break;
+ }
+
+ O << '%' << SparcInstPrinter::getRegisterName(Reg);
+ return false;
+ }
case 'f':
case 'r':
break;
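
A hedged usage sketch of the new modifiers (GCC-style inline asm on 32-bit
SPARC; register binding is up to the allocator, so a real use may need
explicit register constraints, as the error note above warns):

    long long V = 0x0123456789abcdefLL;
    unsigned Lo, Hi;
    asm("mov %L2, %0\n\tmov %H2, %1" : "=r"(Lo), "=r"(Hi) : "r"(V));
    // Hi now holds the even (high) register of the pair, Lo the odd (low) one.
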
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a9751e1..6f65344 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
switch (Op.getOpcode()) {
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce3b6af..270dd32 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2161,6 +2161,11 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(X86sub_flag_nocf GR16:$src, -1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
def : Pat<(X86sub_flag_nocf GR32:$src, -1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
+
+ def : Pat<(or_is_add GR8:$src, 1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>;
+ def : Pat<(or_is_add GR16:$src, 1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
+ def : Pat<(or_is_add GR32:$src, 1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
+ def : Pat<(or_is_add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
}
}
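
A hedged illustration of why an `or` with 1 can select INC here: or_is_add
only matches when the bit being set is known clear in the source, and in
that case x | 1 == x + 1.

    #include <cassert>
    #include <cstdint>
    int main() {
      for (uint32_t X = 0; X < 1024; X += 2) // bit 0 known zero
        assert((X | 1u) == X + 1u);
      return 0;
    }
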
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f243343..a5b2e48 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6276,10 +6276,10 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
case X86::RCPSSm:
case X86::RCPSSr_Int:
case X86::RCPSSm_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSDm:
- case X86::ROUNDSSr:
- case X86::ROUNDSSm:
+ case X86::ROUNDSDri:
+ case X86::ROUNDSDmi:
+ case X86::ROUNDSSri:
+ case X86::ROUNDSSmi:
case X86::RSQRTSSr:
case X86::RSQRTSSm:
case X86::RSQRTSSr_Int:
@@ -6778,14 +6778,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VRCPSSr_Int:
case X86::VRCPSSm:
case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
+ case X86::VROUNDSDri:
+ case X86::VROUNDSDmi:
+ case X86::VROUNDSDri_Int:
+ case X86::VROUNDSDmi_Int:
+ case X86::VROUNDSSri:
+ case X86::VROUNDSSmi:
+ case X86::VROUNDSSri_Int:
+ case X86::VROUNDSSmi_Int:
case X86::VRSQRTSSr:
case X86::VRSQRTSSr_Int:
case X86::VRSQRTSSm:
@@ -7516,8 +7516,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VRCPSSr_Int:
case X86::RSQRTSSr_Int:
case X86::VRSQRTSSr_Int:
- case X86::ROUNDSSr_Int:
- case X86::VROUNDSSr_Int:
+ case X86::ROUNDSSri_Int:
+ case X86::VROUNDSSri_Int:
case X86::COMISSrr_Int:
case X86::VCOMISSrr_Int:
case X86::VCOMISSZrr_Int:
@@ -7685,8 +7685,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VCVTSD2USI64Zrr_Int:
case X86::VCVTTSD2USIZrr_Int:
case X86::VCVTTSD2USI64Zrr_Int:
- case X86::ROUNDSDr_Int:
- case X86::VROUNDSDr_Int:
+ case X86::ROUNDSDri_Int:
+ case X86::VROUNDSDri_Int:
case X86::COMISDrr_Int:
case X86::VCOMISDrr_Int:
case X86::VCOMISDZrr_Int:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 69d4536..2b391b6 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5475,35 +5475,35 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
let Uses = [MXCSR], mayRaiseFPException = 1 in {
- def r : SS4AIi8<opc, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
- Sched<[sched]>;
+ def ri : SS4AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
+ Sched<[sched]>;
// Vector intrinsic operation, mem
- def m : SS4AIi8<opc, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
- Sched<[sched.Folded]>;
+ def mi : SS4AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SSr : SS4AIi8<opcss, MRMSrcReg,
+ def SSri : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[sched]>;
let mayLoad = 1 in
- def SSm : SS4AIi8<opcss, MRMSrcMem,
+ def SSmi : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5511,14 +5511,14 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ def SDri : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[sched]>;
let mayLoad = 1 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ def SDmi : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5530,44 +5530,44 @@ multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SSr : SS4AIi8<opcss, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched]>;
+ def SSri : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
let mayLoad = 1 in
- def SSm : SS4AIi8<opcss, MRMSrcMem,
- (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def SSmi : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched]>;
+ def SDri : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
let mayLoad = 1 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
- (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def SDmi : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}
-multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr, X86FoldableSchedWrite sched,
- ValueType VT32, ValueType VT64,
- SDNode OpNode, bit Is2Addr = 1> {
+multiclass sse41_fp_unop_s_int<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ ValueType VT32, ValueType VT64,
+ SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
- def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ def SSri_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5577,7 +5577,7 @@ let ExeDomain = SSEPackedSingle in {
[(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
- def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ def SSmi_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5590,7 +5590,7 @@ let ExeDomain = SSEPackedSingle in {
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
let ExeDomain = SSEPackedDouble in {
- def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ def SDri_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5600,7 +5600,7 @@ let ExeDomain = SSEPackedDouble in {
[(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
- def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ def SDmi_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5636,25 +5636,25 @@ let Predicates = [HasAVX, NoVLX] in {
}
}
let Predicates = [UseAVX] in {
- defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
- v4f32, v2f64, X86RndScales, 0>,
- VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
+ defm VROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales, 0>,
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}
let Predicates = [UseAVX] in {
def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+ (VROUNDSSri (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
+ (VROUNDSDri (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ (VROUNDSSmi (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ (VROUNDSDmi (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -5667,21 +5667,21 @@ defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
- v4f32, v2f64, X86RndScales>;
+defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
- (ROUNDSSr FR32:$src1, timm:$src2)>;
+ (ROUNDSSri FR32:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
- (ROUNDSDr FR64:$src1, timm:$src2)>;
+ (ROUNDSDri FR64:$src1, timm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
- (ROUNDSSm addr:$src1, timm:$src2)>;
+ (ROUNDSSmi addr:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
- (ROUNDSDm addr:$src1, timm:$src2)>;
+ (ROUNDSDmi addr:$src1, timm:$src2)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 0027de8..63ac910 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -324,14 +324,14 @@ defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply
defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product.
-defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product.
-defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM).
+defm : X86WriteRes<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 19, [2,1,1,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 20, [2,1,1,1,1], 6>;
defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs.
-defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding.
-defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM).
+defm : BWWriteResPair<WriteFRnd, [BWPort1], 6, [2], 2, 5>; // Floating point rounding.
+defm : BWWriteResPair<WriteFRndY, [BWPort1], 6, [2], 2, 6>; // Floating point rounding (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>;
-defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>;
-defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>;
defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals.
defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index a11b470..516dc62 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -324,15 +324,14 @@ defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>;
defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>;
defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>;
-defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>;
-defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>;
+defm : X86WriteRes<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>;
+defm : X86WriteRes<WriteDPPSLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 20, [2,1,1,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 21, [2,1,1,1,1], 6>;
defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
-defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1
-defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>;
-defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>;
-defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteFRnd, [HWPort1], 6, [2], 2, 6>;
+defm : HWWriteResPair<WriteFRndY, [HWPort1], 6, [2], 2, 7>;
+defm : HWWriteResPair<WriteFRndZ, [HWPort1], 6, [2], 2, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>;
defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 88bb9ad..ff3fe32 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -2290,8 +2290,8 @@ def SPRWriteResGroup218 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> {
let Latency = 15;
let NumMicroOps = 3;
}
-def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)m$")>;
-def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)m((_Int)?)$",
+def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)mi$")>;
+def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)mi((_Int)?)$",
"^VRNDSCALEP(D|S)Z128rm(bi|ik)$",
"^VRNDSCALEP(D|S)Z128rmbik(z?)$",
"^VRNDSCALEP(D|S)Z128rmi((kz)?)$",
@@ -2303,13 +2303,13 @@ def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> {
let Latency = 8;
let NumMicroOps = 2;
}
-def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)r$",
- "^(V?)ROUND(PS|SD)r$",
- "^(V?)ROUNDS(D|S)r_Int$",
+def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$",
+ "^(V?)ROUND(PS|SD)ri$",
+ "^(V?)ROUNDS(D|S)ri_Int$",
"^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$",
"^VRNDSCALES(D|S)Zr$",
"^VRNDSCALES(D|S)Zr(b?)_Int((k|kz)?)$",
- "^VROUNDP(D|S)Yr$")>;
+ "^VROUNDP(D|S)Yri$")>;
def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> {
let ReleaseAtCycles = [2];
@@ -3737,7 +3737,7 @@ def SPRWriteResGroup390 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> {
let NumMicroOps = 3;
}
def : InstRW<[SPRWriteResGroup390], (instregex "^VF(C?)MADDCPHZ(128|256)m(b?)$",
- "^VROUNDP(D|S)Ym$")>;
+ "^VROUNDP(D|S)Ymi$")>;
def : InstRW<[SPRWriteResGroup390, ReadAfterVecXLd], (instregex "^VF(C?)MADDCSHZm$",
"^VF(C?)MULCPHZ128rm(b?)$",
"^VF(C?)MULCSHZrm$",
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 4fa138f..3ee931f 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -311,8 +311,10 @@ defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>;
defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product.
-defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>;
-defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>;
+defm : X86WriteRes<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSLd, [SKLPort5,SKLPort01,SKLPort06,SKLPort23], 19, [1,3,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [SKLPort5,SKLPort01,SKLPort06,SKLPort23], 20, [1,3,1,1], 6>;
defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs.
defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding.
defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 3da688c..a7dff0e 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -311,8 +311,10 @@ defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>;
defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>;
defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>;
defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product.
-defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>;
-defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>;
+defm : X86WriteRes<WriteDPPS, [SKXPort5,SKXPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSY, [SKXPort5,SKXPort01], 13, [1,3], 4>;
+defm : X86WriteRes<WriteDPPSLd, [SKXPort5,SKXPort01,SKXPort06,SKXPort23], 19, [1,3,1,1], 6>;
+defm : X86WriteRes<WriteDPPSYLd, [SKXPort5,SKXPort01,SKXPort06,SKXPort23], 20, [1,3,1,1], 6>;
defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs.
defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding.
defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index d90c8bd..2e87d52 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -52,7 +52,7 @@ def Znver3Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME
+ // FIXME:
let HighLatency = 25; // FIXME: any better choice?
// AMD SOG 19h, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
@@ -193,11 +193,11 @@ def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
-def Zn3FPP0 : ProcResource<1>;
-def Zn3FPP1 : ProcResource<1>;
-def Zn3FPP2 : ProcResource<1>;
-def Zn3FPP3 : ProcResource<1>;
-def Zn3FPP45 : ProcResource<2>;
+def Zn3FP0 : ProcResource<1>;
+def Zn3FP1 : ProcResource<1>;
+def Zn3FP2 : ProcResource<1>;
+def Zn3FP3 : ProcResource<1>;
+def Zn3FP45 : ProcResource<2>;
//
// Execution Units
@@ -205,63 +205,63 @@ def Zn3FPP45 : ProcResource<2>;
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
-defvar Zn3FPFMul0 = Zn3FPP0;
-defvar Zn3FPFMul1 = Zn3FPP1;
+defvar Zn3FPFMul0 = Zn3FP0;
+defvar Zn3FPFMul1 = Zn3FP1;
// (v)FADD*
-defvar Zn3FPFAdd0 = Zn3FPP2;
-defvar Zn3FPFAdd1 = Zn3FPP3;
+defvar Zn3FPFAdd0 = Zn3FP2;
+defvar Zn3FPFAdd1 = Zn3FP3;
// All convert operations except pack/unpack
-defvar Zn3FPFCvt0 = Zn3FPP2;
-defvar Zn3FPFCvt1 = Zn3FPP3;
+defvar Zn3FPFCvt0 = Zn3FP2;
+defvar Zn3FPFCvt1 = Zn3FP3;
// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
-defvar Zn3FPFDiv = Zn3FPP1;
+defvar Zn3FPFDiv = Zn3FP1;
// Moves and Logical operations on Floating Point Data Types
-defvar Zn3FPFMisc0 = Zn3FPP0;
-defvar Zn3FPFMisc1 = Zn3FPP1;
-defvar Zn3FPFMisc2 = Zn3FPP2;
-defvar Zn3FPFMisc3 = Zn3FPP3;
+defvar Zn3FPFMisc0 = Zn3FP0;
+defvar Zn3FPFMisc1 = Zn3FP1;
+defvar Zn3FPFMisc2 = Zn3FP2;
+defvar Zn3FPFMisc3 = Zn3FP3;
// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
-defvar Zn3FPVAdd0 = Zn3FPP0;
-defvar Zn3FPVAdd1 = Zn3FPP1;
-defvar Zn3FPVAdd2 = Zn3FPP2;
-defvar Zn3FPVAdd3 = Zn3FPP3;
+defvar Zn3FPVAdd0 = Zn3FP0;
+defvar Zn3FPVAdd1 = Zn3FP1;
+defvar Zn3FPVAdd2 = Zn3FP2;
+defvar Zn3FPVAdd3 = Zn3FP3;
// Integer Multiplies, SAD, Blendvb
-defvar Zn3FPVMul0 = Zn3FPP0;
-defvar Zn3FPVMul1 = Zn3FPP3;
+defvar Zn3FPVMul0 = Zn3FP0;
+defvar Zn3FPVMul1 = Zn3FP3;
// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
-defvar Zn3FPVShuf = Zn3FPP1;
-defvar Zn3FPVShufAux = Zn3FPP2;
+defvar Zn3FPVShuf = Zn3FP1;
+defvar Zn3FPVShufAux = Zn3FP2;
// Bit Shift Left/Right operations
-defvar Zn3FPVShift0 = Zn3FPP1;
-defvar Zn3FPVShift1 = Zn3FPP2;
+defvar Zn3FPVShift0 = Zn3FP1;
+defvar Zn3FPVShift1 = Zn3FP2;
// Moves and Logical operations on Packed Integer Data Types
-defvar Zn3FPVMisc0 = Zn3FPP0;
-defvar Zn3FPVMisc1 = Zn3FPP1;
-defvar Zn3FPVMisc2 = Zn3FPP2;
-defvar Zn3FPVMisc3 = Zn3FPP3;
+defvar Zn3FPVMisc0 = Zn3FP0;
+defvar Zn3FPVMisc1 = Zn3FP1;
+defvar Zn3FPVMisc2 = Zn3FP2;
+defvar Zn3FPVMisc3 = Zn3FP3;
// *AES*
-defvar Zn3FPAES0 = Zn3FPP0;
-defvar Zn3FPAES1 = Zn3FPP1;
+defvar Zn3FPAES0 = Zn3FP0;
+defvar Zn3FPAES1 = Zn3FP1;
// *CLM*
-defvar Zn3FPCLM0 = Zn3FPP0;
-defvar Zn3FPCLM1 = Zn3FPP1;
+defvar Zn3FPCLM0 = Zn3FP0;
+defvar Zn3FPCLM1 = Zn3FP1;
// Execution pipeline grouping
//===----------------------------------------------------------------------===//
@@ -269,7 +269,7 @@ defvar Zn3FPCLM1 = Zn3FPP1;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
-def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
+def Zn3FPU0123 : ProcResGroup<[Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3]>;
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
@@ -293,12 +293,12 @@ def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
-defvar Zn3FPLd01 = Zn3FPP45;
+defvar Zn3FPLd01 = Zn3FP45;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
-let Super = Zn3FPP45 in
+let Super = Zn3FP45 in
def Zn3FPSt : ProcResource<1>;
// Integer Adds, Subtracts, and Compares
@@ -345,8 +345,8 @@ def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
-def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0
- Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1
+def Zn3FP : ProcResGroup<[Zn3FP0, Zn3FP2, /*Zn3FP4,*/ // scheduler 0
+ Zn3FP1, Zn3FP3, Zn3FP45 /*Zn3FP5*/ // scheduler 1
]> {
let BufferSize = !mul(2, 32);
}
@@ -838,9 +838,9 @@ defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
// Floating point. This covers both scalar and vector operations.
-defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 276bc7f..86b4560 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -211,8 +211,9 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, bool JIT,
}
static CodeModel::Model
-getEffectiveX86CodeModel(std::optional<CodeModel::Model> CM, bool JIT,
- bool Is64Bit) {
+getEffectiveX86CodeModel(const Triple &TT, std::optional<CodeModel::Model> CM,
+ bool JIT) {
+ bool Is64Bit = TT.getArch() == Triple::x86_64;
if (CM) {
if (*CM == CodeModel::Tiny)
report_fatal_error("Target does not support the tiny CodeModel", false);
@@ -234,7 +235,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(
T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, JIT, RM),
- getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
+ getEffectiveX86CodeModel(TT, CM, JIT),
OL),
TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
// On PS4/PS5, the "return address" of a 'noreturn' call must still be within
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2ec2946..cd61029 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2664,9 +2664,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
};
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
diff --git a/llvm/lib/TextAPI/InterfaceFile.cpp b/llvm/lib/TextAPI/InterfaceFile.cpp
index 9979df92..79694c9 100644
--- a/llvm/lib/TextAPI/InterfaceFile.cpp
+++ b/llvm/lib/TextAPI/InterfaceFile.cpp
@@ -54,7 +54,7 @@ void InterfaceFile::addParentUmbrella(const Target &Target_, StringRef Parent) {
ParentUmbrellas.emplace(Iter, Target_, std::string(Parent));
}
-void InterfaceFile::addRPath(const Target &InputTarget, StringRef RPath) {
+void InterfaceFile::addRPath(StringRef RPath, const Target &InputTarget) {
if (RPath.empty())
return;
using RPathEntryT = const std::pair<Target, std::string>;
@@ -198,9 +198,9 @@ InterfaceFile::merge(const InterfaceFile *O) const {
IF->addReexportedLibrary(Lib.getInstallName(), Target);
for (const auto &[Target, Path] : rpaths())
- IF->addRPath(Target, Path);
+ IF->addRPath(Path, Target);
for (const auto &[Target, Path] : O->rpaths())
- IF->addRPath(Target, Path);
+ IF->addRPath(Path, Target);
for (const auto *Sym : symbols()) {
IF->addSymbol(Sym->getKind(), Sym->getName(), Sym->targets(),
@@ -319,7 +319,7 @@ InterfaceFile::extract(Architecture Arch) const {
for (const auto &It : rpaths())
if (It.first.Arch == Arch)
- IF->addRPath(It.first, It.second);
+ IF->addRPath(It.second, It.first);
for (const auto &Lib : allowableClients())
for (const auto &Target : Lib.targets())
diff --git a/llvm/lib/TextAPI/TextStubV5.cpp b/llvm/lib/TextAPI/TextStubV5.cpp
index d969810..b072c0b 100644
--- a/llvm/lib/TextAPI/TextStubV5.cpp
+++ b/llvm/lib/TextAPI/TextStubV5.cpp
@@ -672,7 +672,7 @@ Expected<IFPtr> parseToInterfaceFile(const Object *File) {
F->addParentUmbrella(Target, Lib);
for (auto &[Path, Targets] : RPaths)
for (auto Target : Targets)
- F->addRPath(Target, Path);
+ F->addRPath(Path, Target);
for (auto &[Targets, Symbols] : Exports)
for (auto &Sym : Symbols)
F->addSymbol(Sym.Kind, Sym.Name, Targets, Sym.Flags);
diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
index bb46539..1ca89e0 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp
@@ -247,10 +247,10 @@ void SampleProfileMatcher::runOnFunction(Function &F) {
if (ReportProfileStaleness || PersistProfileStaleness)
recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, nullptr);
- // Run profile matching for checksum mismatched profile, currently only
- // support for pseudo-probe.
- if (SalvageStaleProfile && FunctionSamples::ProfileIsProbeBased &&
- !ProbeManager->profileIsValid(F, *FSFlattened)) {
+ // Run matching unconditionally for non-probe-based profiles; for
+ // probe-based profiles, run it only when the current profile is not valid.
+ if (SalvageStaleProfile && (!FunctionSamples::ProfileIsProbeBased ||
+ !ProbeManager->profileIsValid(F, *FSFlattened))) {
// For imported functions, the checksum metadata(pseudo_probe_desc) are
// dropped, so we leverage function attribute(profile-checksum-mismatch) to
// transfer the info: add the attribute during pre-link phase and check it
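Spelled out, matching now runs unconditionally for non-probe-based (line-based) profiles and keeps the checksum gate for probe-based ones. A minimal standalone sketch of the gating decision (function and parameter names are illustrative, not the pass's actual helpers):

// Mirrors the condition above: SalvageStaleProfile must be on; line-based
// profiles always attempt matching; probe-based profiles attempt it only
// when the checksum-based validity check fails.
static bool shouldRunMatching(bool SalvageStaleProfile,
                              bool ProfileIsProbeBased,
                              bool ProbeProfileIsValid) {
  if (!SalvageStaleProfile)
    return false;
  if (!ProfileIsProbeBased)
    return true;
  return !ProbeProfileIsValid;
}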
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index 4d0fa24..9a191b0 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -178,8 +178,7 @@ SampleProfileProber::SampleProfileProber(Function &Func,
DenseSet<BasicBlock *> BlocksAndCallsToIgnore;
computeBlocksToIgnore(BlocksToIgnore, BlocksAndCallsToIgnore);
- computeProbeIdForBlocks(BlocksToIgnore);
- computeProbeIdForCallsites(BlocksAndCallsToIgnore);
+ computeProbeId(BlocksToIgnore, BlocksAndCallsToIgnore);
computeCFGHash(BlocksToIgnore);
}
@@ -300,27 +299,20 @@ void SampleProfileProber::computeCFGHash(
<< ", Hash = " << FunctionHash << "\n");
}
-void SampleProfileProber::computeProbeIdForBlocks(
- const DenseSet<BasicBlock *> &BlocksToIgnore) {
- for (auto &BB : *F) {
- if (BlocksToIgnore.contains(&BB))
- continue;
- BlockProbeIds[&BB] = ++LastProbeId;
- }
-}
-
-void SampleProfileProber::computeProbeIdForCallsites(
+void SampleProfileProber::computeProbeId(
+ const DenseSet<BasicBlock *> &BlocksToIgnore,
const DenseSet<BasicBlock *> &BlocksAndCallsToIgnore) {
LLVMContext &Ctx = F->getContext();
Module *M = F->getParent();
for (auto &BB : *F) {
+ if (!BlocksToIgnore.contains(&BB))
+ BlockProbeIds[&BB] = ++LastProbeId;
+
if (BlocksAndCallsToIgnore.contains(&BB))
continue;
for (auto &I : BB) {
- if (!isa<CallBase>(I))
- continue;
- if (isa<IntrinsicInst>(&I))
+ if (!isa<CallBase>(I) || isa<IntrinsicInst>(&I))
continue;
// The current implementation uses the lower 16 bits of the discriminator
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 3986359..4df18c8 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -583,10 +583,8 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
// RemoveDIs: there's no bitcode representation of the DbgVariableRecord
// debug-info, convert to dbg.values before writing out.
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat &&
+ WriteNewDbgInfoFormatToBitcode);
bool Changed = writeThinLTOBitcode(
OS, ThinLinkOS,
@@ -595,8 +593,5 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
},
M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
-
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
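The RAII setter guarantees the module's debug-info format is restored on every exit path, which the manual convert/restore pair did not. A minimal sketch of the pattern, assuming Module's public debug-format members (the real ScopedDbgInfoFormatSetter ships with LLVM's IR headers; this simplified class is illustrative):

#include "llvm/IR/Module.h"
using namespace llvm;

// Simplified model of ScopedDbgInfoFormatSetter: remember the old format,
// switch to the requested one, and restore the old format on destruction.
class ScopedFormatSetterSketch {
  Module &M;
  bool OldFormat;

public:
  ScopedFormatSetterSketch(Module &M, bool NewFormat)
      : M(M), OldFormat(M.IsNewDbgInfoFormat) {
    M.setIsNewDbgInfoFormat(NewFormat);
  }
  ~ScopedFormatSetterSketch() { M.setIsNewDbgInfoFormat(OldFormat); }
};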
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 9ab2bd8..4d3de76 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1687,6 +1687,109 @@ static Value *foldSelectInstWithICmpConst(SelectInst &SI, ICmpInst *ICI,
return nullptr;
}
+static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
+ InstCombinerImpl &IC) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (!ICmpInst::isEquality(Pred))
+ return nullptr;
+
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+
+ if (Pred == ICmpInst::ICMP_NE)
+ std::swap(TrueVal, FalseVal);
+
+ // Transform (X == C) ? X : Y -> (X == C) ? C : Y
+ // with specific handling for bitwise operations:
+ // x&y -> (x|y) ^ (x^y) or (x|y) & ~(x^y)
+ // x|y -> (x&y) | (x^y) or (x&y) ^ (x^y)
+ // x^y -> (x|y) ^ (x&y) or (x|y) & ~(x&y)
+ Value *X, *Y;
+ if (!match(CmpLHS, m_BitwiseLogic(m_Value(X), m_Value(Y))) ||
+ !match(TrueVal, m_c_BitwiseLogic(m_Specific(X), m_Specific(Y))))
+ return nullptr;
+
+ const unsigned AndOps = Instruction::And, OrOps = Instruction::Or,
+ XorOps = Instruction::Xor, NoOps = 0;
+ enum NotMask { None = 0, NotInner, NotRHS };
+
+ auto matchFalseVal = [&](unsigned OuterOpc, unsigned InnerOpc,
+ unsigned NotMask) {
+ auto matchInner = m_c_BinOp(InnerOpc, m_Specific(X), m_Specific(Y));
+ if (OuterOpc == NoOps)
+ return match(CmpRHS, m_Zero()) && match(FalseVal, matchInner);
+
+ if (NotMask == NotInner) {
+ return match(FalseVal,
+ m_c_BinOp(OuterOpc, m_Not(matchInner), m_Specific(CmpRHS)));
+ } else if (NotMask == NotRHS) {
+ return match(FalseVal,
+ m_c_BinOp(OuterOpc, matchInner, m_Not(m_Specific(CmpRHS))));
+ } else {
+ return match(FalseVal,
+ m_c_BinOp(OuterOpc, matchInner, m_Specific(CmpRHS)));
+ }
+ };
+
+ // (X&Y)==C ? X|Y : X^Y -> (X^Y)|C : X^Y or (X^Y)^ C : X^Y
+ // (X&Y)==C ? X^Y : X|Y -> (X|Y)^C : X|Y or (X|Y)&~C : X|Y
+ if (match(CmpLHS, m_And(m_Value(X), m_Value(Y)))) {
+ if (match(TrueVal, m_c_Or(m_Specific(X), m_Specific(Y)))) {
+ // (X&Y)==C ? X|Y : (X^Y)|C -> (X^Y)|C : (X^Y)|C -> (X^Y)|C
+ // (X&Y)==C ? X|Y : (X^Y)^C -> (X^Y)^C : (X^Y)^C -> (X^Y)^C
+ if (matchFalseVal(OrOps, XorOps, None) ||
+ matchFalseVal(XorOps, XorOps, None))
+ return IC.replaceInstUsesWith(SI, FalseVal);
+ } else if (match(TrueVal, m_c_Xor(m_Specific(X), m_Specific(Y)))) {
+ // (X&Y)==C ? X^Y : (X|Y)^ C -> (X|Y)^ C : (X|Y)^ C -> (X|Y)^ C
+ // (X&Y)==C ? X^Y : (X|Y)&~C -> (X|Y)&~C : (X|Y)&~C -> (X|Y)&~C
+ if (matchFalseVal(XorOps, OrOps, None) ||
+ matchFalseVal(AndOps, OrOps, NotRHS))
+ return IC.replaceInstUsesWith(SI, FalseVal);
+ }
+ }
+
+ // (X|Y)==C ? X&Y : X^Y -> (X^Y)^C : X^Y or ~(X^Y)&C : X^Y
+ // (X|Y)==C ? X^Y : X&Y -> (X&Y)^C : X&Y or ~(X&Y)&C : X&Y
+ if (match(CmpLHS, m_Or(m_Value(X), m_Value(Y)))) {
+ if (match(TrueVal, m_c_And(m_Specific(X), m_Specific(Y)))) {
+ // (X|Y)==C ? X&Y: (X^Y)^C -> (X^Y)^C: (X^Y)^C -> (X^Y)^C
+ // (X|Y)==C ? X&Y:~(X^Y)&C ->~(X^Y)&C:~(X^Y)&C -> ~(X^Y)&C
+ if (matchFalseVal(XorOps, XorOps, None) ||
+ matchFalseVal(AndOps, XorOps, NotInner))
+ return IC.replaceInstUsesWith(SI, FalseVal);
+ } else if (match(TrueVal, m_c_Xor(m_Specific(X), m_Specific(Y)))) {
+ // (X|Y)==C ? X^Y : (X&Y)^C -> (X&Y)^C : (X&Y)^C -> (X&Y)^C
+ // (X|Y)==C ? X^Y :~(X&Y)&C -> ~(X&Y)&C :~(X&Y)&C -> ~(X&Y)&C
+ if (matchFalseVal(XorOps, AndOps, None) ||
+ matchFalseVal(AndOps, AndOps, NotInner))
+ return IC.replaceInstUsesWith(SI, FalseVal);
+ }
+ }
+
+ // (X^Y)==C ? X&Y : X|Y -> (X|Y)^C : X|Y or (X|Y)&~C : X|Y
+ // (X^Y)==C ? X|Y : X&Y -> (X&Y)|C : X&Y or (X&Y)^ C : X&Y
+ if (match(CmpLHS, m_Xor(m_Value(X), m_Value(Y)))) {
+ if ((match(TrueVal, m_c_And(m_Specific(X), m_Specific(Y))))) {
+ // (X^Y)==C ? X&Y : (X|Y)^C -> (X|Y)^C
+ // (X^Y)==C ? X&Y : (X|Y)&~C -> (X|Y)&~C
+ if (matchFalseVal(XorOps, OrOps, None) ||
+ matchFalseVal(AndOps, OrOps, NotRHS))
+ return IC.replaceInstUsesWith(SI, FalseVal);
+ } else if (match(TrueVal, m_c_Or(m_Specific(X), m_Specific(Y)))) {
+ // (X^Y)==C ? (X|Y) : (X&Y)|C -> (X&Y)|C
+ // (X^Y)==C ? (X|Y) : (X&Y)^C -> (X&Y)^C
+ if (matchFalseVal(OrOps, AndOps, None) ||
+ matchFalseVal(XorOps, AndOps, None))
+ return IC.replaceInstUsesWith(SI, FalseVal);
+ }
+ }
+
+ return nullptr;
+}
+
/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
@@ -1729,6 +1832,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
}
}
+ if (Instruction *NewSel = foldSelectICmpEq(SI, ICI, *this))
+ return NewSel;
+
// Canonicalize a signbit condition to use zero constant by swapping:
// (CmpLHS > -1) ? TV : FV --> (CmpLHS < 0) ? FV : TV
// To avoid conflicts (infinite loops) with other canonicalizations, this is
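The fold is justified by identities among and/or/xor under the equality guard; for example, X | Y == (X ^ Y) | (X & Y) always holds, so once (X & Y) == C is known, X | Y can be rewritten as (X ^ Y) | C. A tiny exhaustive check of that identity over 8-bit values:

#include <cassert>
#include <cstdint>

// Verifies: for all x, y, with c = x & y, x | y == (x ^ y) | c. This is the
// identity behind replacing the select with its (rewritten) false arm.
int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y) {
      uint8_t C = uint8_t(X & Y);
      assert(uint8_t(X | Y) == uint8_t((X ^ Y) | C));
    }
  return 0;
}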
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index d0d349c..ad1cd9c 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -182,18 +182,11 @@ static cl::opt<bool> ClWithTls(
"platforms that support this"),
cl::Hidden, cl::init(true));
-static cl::opt<bool>
- CSelectiveInstrumentation("hwasan-selective-instrumentation",
- cl::desc("Use selective instrumentation"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClHotPercentileCutoff(
- "hwasan-percentile-cutoff-hot", cl::init(0),
- cl::desc("Alternative hot percentile cuttoff."
- "By default `-profile-summary-cutoff-hot` is used."));
+static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot",
+ cl::desc("Hot percentile cuttoff."));
static cl::opt<float>
- ClRandomSkipRate("hwasan-random-skip-rate", cl::init(0),
+ ClRandomSkipRate("hwasan-random-skip-rate",
cl::desc("Probability value in the range [0.0, 1.0] "
"to skip instrumentation of a function."));
@@ -317,7 +310,7 @@ private:
};
bool selectiveInstrumentationShouldSkip(Function &F,
- FunctionAnalysisManager &FAM);
+ FunctionAnalysisManager &FAM) const;
void initializeModule();
void createHwasanCtorComdat();
@@ -1500,28 +1493,22 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
}
bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
- Function &F, FunctionAnalysisManager &FAM) {
+ Function &F, FunctionAnalysisManager &FAM) const {
if (ClRandomSkipRate.getNumOccurrences()) {
std::bernoulli_distribution D(ClRandomSkipRate);
- if (D(*Rng))
- return true;
- } else {
- auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- if (PSI && PSI->hasProfileSummary()) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- if ((ClHotPercentileCutoff.getNumOccurrences() &&
- ClHotPercentileCutoff >= 0)
- ? PSI->isFunctionHotInCallGraphNthPercentile(
- ClHotPercentileCutoff, &F, BFI)
- : PSI->isFunctionHotInCallGraph(&F, BFI))
- return true;
- } else {
- ++NumNoProfileSummaryFuncs;
- }
+ return D(*Rng);
}
- return false;
+ if (!ClHotPercentileCutoff.getNumOccurrences())
+ return false;
+ auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ if (!PSI || !PSI->hasProfileSummary()) {
+ ++NumNoProfileSummaryFuncs;
+ return false;
+ }
+ return PSI->isFunctionHotInCallGraphNthPercentile(
+ ClHotPercentileCutoff, &F, FAM.getResult<BlockFrequencyAnalysis>(F));
}
void HWAddressSanitizer::sanitizeFunction(Function &F,
@@ -1537,7 +1524,7 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
NumTotalFuncs++;
- if (CSelectiveInstrumentation && selectiveInstrumentationShouldSkip(F, FAM))
+ if (selectiveInstrumentationShouldSkip(F, FAM))
return;
NumInstrumentedFuncs++;
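The rewritten helper makes the precedence explicit: a configured random skip rate wins outright; otherwise skipping requires an explicit hot-percentile cutoff plus a profile summary that marks the function hot. A condensed standalone model of that policy (names are illustrative):

#include <random>

// Condensed model of selectiveInstrumentationShouldSkip after this change:
// RandomRate overrides everything; without a cutoff or without profile data
// the function is always instrumented.
static bool shouldSkipSketch(bool HasRandomRate, double RandomRate,
                             std::mt19937 &Rng, bool HasCutoff,
                             bool HasProfileSummary, bool IsHotAtCutoff) {
  if (HasRandomRate)
    return std::bernoulli_distribution(RandomRate)(Rng);
  if (!HasCutoff)
    return false;
  if (!HasProfileSummary)
    return false; // counted via NumNoProfileSummaryFuncs in the pass
  return IsHotAtCutoff;
}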
diff --git a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
index 86292c1..6adc29f 100644
--- a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
@@ -23,13 +23,11 @@ using namespace llvm;
#define DEBUG_TYPE "remove-traps"
-static cl::opt<int> HotPercentileCutoff(
- "remove-traps-percentile-cutoff-hot", cl::init(0),
- cl::desc("Alternative hot percentile cuttoff. By default "
- "`-profile-summary-cutoff-hot` is used."));
+static cl::opt<int> HotPercentileCutoff("remove-traps-percentile-cutoff-hot",
+ cl::desc("Hot percentile cuttoff."));
static cl::opt<float>
- RandomRate("remove-traps-random-rate", cl::init(0.0),
+ RandomRate("remove-traps-random-rate",
cl::desc("Probability value in the range [0.0, 1.0] of "
"unconditional pseudo-random checks removal."));
@@ -38,9 +36,11 @@ STATISTIC(NumChecksRemoved, "Number of removed checks");
static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
const ProfileSummaryInfo *PSI) {
- SmallVector<std::pair<IntrinsicInst *, Value *>, 16> ReplaceWithValue;
+ SmallVector<std::pair<IntrinsicInst *, bool>, 16> ReplaceWithValue;
std::unique_ptr<RandomNumberGenerator> Rng;
+ // TODO:
+ // https://github.com/llvm/llvm-project/pull/84858#discussion_r1520603139
auto ShouldRemove = [&](bool IsHot) {
if (!RandomRate.getNumOccurrences())
return IsHot;
@@ -64,18 +64,13 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
bool IsHot = false;
if (PSI) {
uint64_t Count = BFI.getBlockProfileCount(&BB).value_or(0);
- IsHot =
- HotPercentileCutoff.getNumOccurrences()
- ? (HotPercentileCutoff > 0 &&
- PSI->isHotCountNthPercentile(HotPercentileCutoff, Count))
- : PSI->isHotCount(Count);
+ IsHot = PSI->isHotCountNthPercentile(HotPercentileCutoff, Count);
}
bool ToRemove = ShouldRemove(IsHot);
ReplaceWithValue.push_back({
II,
- ToRemove ? Constant::getNullValue(II->getType())
- : (Constant::getAllOnesValue(II->getType())),
+ ToRemove,
});
if (ToRemove)
++NumChecksRemoved;
@@ -88,7 +83,7 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
}
for (auto [I, V] : ReplaceWithValue) {
- I->replaceAllUsesWith(V);
+ I->replaceAllUsesWith(ConstantInt::getBool(I->getType(), !V));
I->eraseFromParent();
}
@@ -107,3 +102,8 @@ PreservedAnalyses RemoveTrapsPass::run(Function &F,
return removeUbsanTraps(F, BFI, PSI) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
}
+
+bool RemoveTrapsPass::IsRequested() {
+ return RandomRate.getNumOccurrences() ||
+ HotPercentileCutoff.getNumOccurrences();
+}
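The replacement list now stores the boolean decision and materializes the constant late: a removed check's guard intrinsic becomes false (the guarded trap folds away), while a kept check's becomes true. A one-line sketch of that mapping (helper name is illustrative):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// ToRemove == true -> i1 false (check dropped); ToRemove == false -> i1 true.
static Value *guardReplacementSketch(IRBuilder<> &B, bool ToRemove) {
  return B.getInt1(!ToRemove);
}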
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0834865..cb0fd06 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -124,6 +124,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
- clEnumValN(
- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
- "data-and-control-without-rt-check",
- "Similar to data-and-control, but remove the runtime check")));
+ clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check"),
+ clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+ "Use predicated EVL instructions for tail folding. If EVL "
+ "is unsupported, fallback to data-without-lane-mask.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -1505,29 +1508,62 @@ public:
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
- : ChosenTailFoldingStyle.second;
+ if (!ChosenTailFoldingStyle)
+ return TailFoldingStyle::None;
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+ : ChosenTailFoldingStyle->second;
}
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- void setTailFoldingStyles() {
- assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
- ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
- "Tail folding must not be selected yet.");
- if (!Legal->prepareToFoldTailByMasking())
+ /// \param IsScalableVF true if scalable vector factors are enabled.
+ /// \param UserIC user-specified interleave count.
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+ if (!Legal->prepareToFoldTailByMasking()) {
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
return;
+ }
- if (ForceTailFoldingStyle.getNumOccurrences()) {
- ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
- ForceTailFoldingStyle;
+ if (!ForceTailFoldingStyle.getNumOccurrences()) {
+ ChosenTailFoldingStyle = std::make_pair(
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
return;
}
- ChosenTailFoldingStyle.first =
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
- ChosenTailFoldingStyle.second =
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
+ // Set styles when forced.
+ ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+ ForceTailFoldingStyle.getValue());
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ return;
+ // Override forced styles if needed.
+ // FIXME: use actual opcode/data type for analysis here.
+ // FIXME: Investigate opportunity for fixed vector factor.
+ bool EVLIsLegal =
+ IsScalableVF && UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath &&
+ // FIXME: implement support for max safe dependency distance.
+ Legal->isSafeForAnyVectorWidth() &&
+ // FIXME: remove this once reductions are supported.
+ Legal->getReductionVars().empty();
+ if (!EVLIsLegal) {
+ // If for some reason EVL mode is unsupported, fall back to
+ // DataWithoutLaneMask to try to vectorize the loop with folded tail
+ // in a generic way.
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask);
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Preference for VP intrinsics indicated. Will "
+ "not try to generate VP Intrinsics "
+ << (UserIC > 1
+ ? "since interleave count specified is greater than 1.\n"
+ : "due to non-interleaving reasons.\n"));
+ }
}
/// Returns true if all loop blocks should be masked to fold tail loop.
@@ -1544,6 +1580,18 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// Returns true if VP intrinsics with explicit vector length support should
+ /// be generated in the tail folded loop.
+ bool foldTailWithEVL() const {
+ return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+ // FIXME: remove this once vp_reverse is supported.
+ none_of(
+ WideningDecisions,
+ [](const std::pair<std::pair<Instruction *, ElementCount>,
+ std::pair<InstWidening, InstructionCost>>
+ &Data) { return Data.second.first == CM_Widen_Reverse; });
+ }
+
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
return InLoopReductions.contains(Phi);
@@ -1688,8 +1736,8 @@ private:
/// Control finally chosen tail folding style. The first element is used if
/// the IV update may overflow, the second element - if it does not.
- std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
- std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+ ChosenTailFoldingStyle;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles();
- if (foldTailByMasking())
+ setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ if (foldTailByMasking()) {
+ if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+ "try to generate VP Intrinsics with scalable vector "
+ "factors only.\n");
+ // Tail folded loop using VP intrinsics restricts the VF to be scalable
+ // for now.
+ // TODO: extend it for fixed vectors, if required.
+ assert(MaxFactors.ScalableVF.isScalable() &&
+ "Expected scalable vector factor.");
+
+ MaxFactors.FixedVF = ElementCount::getFixed(1);
+ }
return MaxFactors;
+ }
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
@@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!isScalarEpilogueAllowed())
return 1;
+ // Do not interleave if EVL is preferred and no User IC is specified.
+ if (foldTailWithEVL()) {
+ LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+ "Unroll factor forced to be 1.\n");
+ return 1;
+ }
+
// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
return 1;
@@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::truncateToMinimalBitwidths(
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
+ // TODO: try to put it close to addActiveLaneMask().
+ if (CM.foldTailWithEVL())
+ VPlanTransforms::addExplicitVectorLength(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
Value *DerivedIV = emitTransformedIndex(
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
@@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
+/// Creates either vp_store or vp_scatter intrinsic calls to represent
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+ Value *StoredVal, bool IsScatter, Value *Mask,
+ Value *EVL, const Align &Alignment) {
+ CallInst *Call;
+ if (IsScatter) {
+ Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+ Intrinsic::vp_scatter,
+ {StoredVal, Addr, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ Call = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
+ Call->addParamAttr(
+ 1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+ return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsic calls to represent
+/// predicated load/gather.
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
+ VectorType *DataTy,
+ Value *Addr, bool IsGather,
+ Value *Mask, Value *EVL,
+ const Align &Alignment) {
+ CallInst *Call;
+ if (IsGather) {
+ Call =
+ Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+ nullptr, "wide.masked.gather");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ Call = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
+ }
+ Call->addParamAttr(
+ 0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+ return Call;
+}
+
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
+ // TODO: split this into several classes for better design.
+ if (State.EVL) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+ VPInstruction::ExplicitVectorLength &&
+ "EVL must be VPInstruction::ExplicitVectorLength.");
+ Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+ // If EVL is not nullptr, it must be a valid value set during plan
+ // creation, possibly a default value equal to the whole vector register
+ // length. EVL is created only if TTI prefers predicated vectorization,
+ // so a non-null EVL also implies a preference for predicated
+ // vectorization.
+ // FIXME: Support reverse store after vp_reverse is added.
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ NewSI = lowerStoreUsingVectorIntrinsics(
+ Builder, State.get(getAddr(), Part, !CreateGatherScatter),
+ StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+ } else if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
- if (CreateGatherScatter) {
+ // TODO: split this into several classes for better design.
+ if (State.EVL) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+ VPInstruction::ExplicitVectorLength &&
+ "EVL must be VPInstruction::ExplicitVectorLength.");
+ Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+ // If EVL is not nullptr, it must be a valid value set during plan
+ // creation, possibly a default value equal to the whole vector register
+ // length. EVL is created only if TTI prefers predicated vectorization,
+ // so a non-null EVL also implies a preference for predicated
+ // vectorization.
+ // FIXME: Support reverse loading after vp_reverse is added.
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ NewLI = lowerLoadUsingVectorIntrinsics(
+ Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
+ CreateGatherScatter, MaskPart, EVL, Alignment);
+ } else if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
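For intuition, here is a scalar model of what an EVL tail-folded loop computes: each iteration requests an explicit vector length no larger than the remaining trip count, so no remainder loop and no lane mask are needed. This is a sketch only; the generated code uses llvm.experimental.get.vector.length and the VP loads/stores built above:

#include <algorithm>
#include <cstddef>

// Scalar model of EVL tail folding for a simple copy loop.
void evlModelCopy(float *Dst, const float *Src, size_t N, size_t VF) {
  for (size_t IV = 0; IV < N;) {
    size_t EVL = std::min(VF, N - IV); // models get.vector.length(N - IV)
    for (size_t L = 0; L < EVL; ++L)   // models one vp.load/vp.store pair
      Dst[IV + L] = Src[IV + L];
    IV += EVL;                         // models the EVL-based IV increment
  }
}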
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6e7dcb9..bdd26ac 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -306,10 +306,7 @@ static bool isCommutative(Instruction *I) {
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
return BO->isCommutative();
- // TODO: This should check for generic Instruction::isCommutative(), but
- // we need to confirm that the caller code correctly handles Intrinsics
- // for example (does not have 2 operands).
- return false;
+ return I->isCommutative();
}
/// \returns inserting index of InsertElement or InsertValue instruction,
@@ -658,6 +655,29 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
+ bool SwappedPredsCompatible = [&]() {
+ if (!IsCmpOp)
+ return false;
+ SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
+ UniquePreds.insert(BasePred);
+ UniqueNonSwappedPreds.insert(BasePred);
+ for (Value *V : VL) {
+ auto *I = dyn_cast<CmpInst>(V);
+ if (!I)
+ return false;
+ CmpInst::Predicate CurrentPred = I->getPredicate();
+ CmpInst::Predicate SwappedCurrentPred =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ UniqueNonSwappedPreds.insert(CurrentPred);
+ if (!UniquePreds.contains(CurrentPred) &&
+ !UniquePreds.contains(SwappedCurrentPred))
+ UniquePreds.insert(CurrentPred);
+ }
+ // If there are more than 2 distinct predicates overall, but treating a
+ // predicate and its swapped form as one leaves only 2, treat the swappable
+ // predicates as compatible main opcodes rather than as alternates.
+ return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
+ }();
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
auto *IBase = cast<Instruction>(VL[BaseIndex]);
@@ -710,7 +730,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
CmpInst::Predicate SwappedCurrentPred =
CmpInst::getSwappedPredicate(CurrentPred);
- if (E == 2 &&
+ if ((E == 2 || SwappedPredsCompatible) &&
(BasePred == CurrentPred || BasePred == SwappedCurrentPred))
continue;
@@ -1087,7 +1107,7 @@ public:
MinBWs.clear();
ReductionBitWidth = 0;
CastMaxMinBWSizes.reset();
- TruncNodes.clear();
+ ExtraBitWidthNodes.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
PostponedGathers.clear();
@@ -1952,6 +1972,9 @@ public:
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+ constexpr unsigned IntrinsicNumOperands = 2;
+ if (isa<IntrinsicInst>(VL[0]))
+ NumOperands = IntrinsicNumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -3397,10 +3420,11 @@ private:
// immediates do not affect scheduler behavior this is considered
// okay.
auto *In = BundleMember->Inst;
- assert(In &&
- (isa<ExtractValueInst, ExtractElementInst>(In) ||
- In->getNumOperands() == TE->getNumOperands()) &&
- "Missed TreeEntry operands?");
+ assert(
+ In &&
+ (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
(void)In; // fake use to avoid build failure when assertions disabled
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
@@ -3659,8 +3683,9 @@ private:
/// type sizes, used in the tree.
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
- /// Indices of the vectorized trunc nodes.
- DenseSet<unsigned> TruncNodes;
+ /// Indices of vectorized nodes that are expected to act as roots of a new
+ /// bitwidth analysis attempt, such as trunc, IToFP or ICmp.
+ DenseSet<unsigned> ExtraBitWidthNodes;
};
} // end namespace slpvectorizer
@@ -6588,7 +6613,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
PrevMaxBW),
std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
PrevMinBW));
- TruncNodes.insert(VectorizableTree.size());
+ ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
+ } else if (ShuffleOrOp == Instruction::SIToFP ||
+ ShuffleOrOp == Instruction::UIToFP) {
+ unsigned NumSignBits =
+ ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
+ if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
+ APInt Mask = DB->getDemandedBits(OpI);
+ NumSignBits = std::max(NumSignBits, Mask.countl_zero());
+ }
+ if (NumSignBits * 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+ ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -6636,6 +6672,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
+ if (ShuffleOrOp == Instruction::ICmp) {
+ unsigned NumSignBits0 =
+ ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
+ if (NumSignBits0 * 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+ unsigned NumSignBits1 =
+ ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
+ if (NumSignBits1 * 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
+ }
return;
}
case Instruction::Select:
@@ -6775,6 +6823,33 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isCommutative(VL0)) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *this);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ SmallVector<ValueList> Operands;
+ for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
+ Operands.emplace_back();
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+ continue;
+ for (Value *V : VL) {
+ auto *CI2 = cast<CallInst>(V);
+ Operands.back().push_back(CI2->getArgOperand(I));
+ }
+ TE->setOperand(I, Operands.back());
+ }
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
+ if (Operands[I - 2].empty())
+ continue;
+ buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
+ }
+ return;
+ }
TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
// For scalar operands no need to create an entry since no need to
@@ -8447,7 +8522,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
ScalarTy = IE->getOperand(1)->getType();
}
- if (!FixedVectorType::isValidElementType(ScalarTy))
+ if (!isValidElementType(ScalarTy))
return InstructionCost::getInvalid();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -9063,25 +9138,35 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
E->getAltOp());
} else {
- Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
- Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
- auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
- if (It != MinBWs.end()) {
- if (!MinBWs.contains(getOperandEntry(E, 0)))
- VecCost =
- TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, Src0Ty,
- TTI::CastContextHint::None, CostKind);
- LLVM_DEBUG({
- dbgs() << "SLP: alternate extension, which should be truncated.\n";
- E->dump();
- });
- return VecCost;
+ Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
+ auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
+ if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
+ auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcBWSz =
+ DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
+ if (SrcIt != MinBWs.end()) {
+ SrcBWSz = SrcIt->second.first;
+ SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
+ SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
+ }
+ if (BWSz <= SrcBWSz) {
+ if (BWSz < SrcBWSz)
+ VecCost =
+ TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
+ TTI::CastContextHint::None, CostKind);
+ LLVM_DEBUG({
+ dbgs()
+ << "SLP: alternate extension, which should be truncated.\n";
+ E->dump();
+ });
+ return VecCost;
+ }
}
- VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
TTI::CastContextHint::None, CostKind);
VecCost +=
- TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
TTI::CastContextHint::None, CostKind);
}
SmallVector<int> Mask;
@@ -12591,15 +12676,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
CmpInst::Predicate AltPred = AltCI->getPredicate();
V1 = Builder.CreateCmp(AltPred, LHS, RHS);
} else {
- if (It != MinBWs.end()) {
- if (!MinBWs.contains(getOperandEntry(E, 0)))
- LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
- assert(LHS->getType() == VecTy && "Expected same type as operand.");
- if (auto *I = dyn_cast<Instruction>(LHS))
- LHS = propagateMetadata(I, E->Scalars);
- E->VectorizedValue = LHS;
- ++NumVectorInstructions;
- return LHS;
+ if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
+ unsigned SrcBWSz = DL->getTypeSizeInBits(
+ cast<VectorType>(LHS->getType())->getElementType());
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ if (BWSz <= SrcBWSz) {
+ if (BWSz < SrcBWSz)
+ LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
+ assert(LHS->getType() == VecTy && "Expected same type as operand.");
+ if (auto *I = dyn_cast<Instruction>(LHS))
+ LHS = propagateMetadata(I, E->Scalars);
+ E->VectorizedValue = LHS;
+ ++NumVectorInstructions;
+ return LHS;
+ }
}
V0 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
@@ -14051,6 +14141,16 @@ bool BoUpSLP::collectValuesToDemote(
}))
return FinalAnalysis();
+ if (!all_of(I->users(),
+ [=](User *U) {
+ return getTreeEntry(U) ||
+ (UserIgnoreList && UserIgnoreList->contains(U)) ||
+ (U->getType()->isSized() &&
+ DL->getTypeSizeInBits(U->getType()) <= BitWidth);
+ }) &&
+ !IsPotentiallyTruncated(I, BitWidth))
+ return false;
+
unsigned Start = 0;
unsigned End = I->getNumOperands();
@@ -14097,25 +14197,52 @@ bool BoUpSLP::collectValuesToDemote(
}
return false;
};
- bool NeedToExit = false;
+ auto TryProcessInstruction =
+ [&](Instruction *I, const TreeEntry &ITE, unsigned &BitWidth,
+ ArrayRef<Value *> Operands = std::nullopt,
+ function_ref<bool(unsigned, unsigned)> Checker = {}) {
+ if (Operands.empty()) {
+ if (!IsTruncRoot)
+ MaxDepthLevel = 1;
+ (void)IsPotentiallyTruncated(V, BitWidth);
+ } else {
+ // Several vectorized uses? Check if we can truncate it, otherwise -
+ // exit.
+ if (ITE.UserTreeIndices.size() > 1 &&
+ !IsPotentiallyTruncated(I, BitWidth))
+ return false;
+ bool NeedToExit = false;
+ if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
+ return false;
+ if (NeedToExit)
+ return true;
+ if (!ProcessOperands(Operands, NeedToExit))
+ return false;
+ if (NeedToExit)
+ return true;
+ }
+
+ ++MaxDepthLevel;
+ // Gather demoted constant operands.
+ for (unsigned Idx : seq<unsigned>(Start, End))
+ if (isa<Constant>(I->getOperand(Idx)))
+ DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return IsProfitableToDemote;
+ };
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
- if (!IsTruncRoot)
- MaxDepthLevel = 1;
if (IsProfitableToDemoteRoot)
IsProfitableToDemote = true;
- (void)IsPotentiallyTruncated(V, BitWidth);
- break;
+ return TryProcessInstruction(I, *ITE, BitWidth);
case Instruction::ZExt:
case Instruction::SExt:
- if (!IsTruncRoot)
- MaxDepthLevel = 1;
IsProfitableToDemote = true;
- (void)IsPotentiallyTruncated(V, BitWidth);
- break;
+ return TryProcessInstruction(I, *ITE, BitWidth);
// We can demote certain binary operations if we can demote both of their
// operands.
@@ -14125,140 +14252,83 @@ bool BoUpSLP::collectValuesToDemote(
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
- if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
- return false;
- break;
+ return TryProcessInstruction(I, *ITE, BitWidth,
+ {I->getOperand(0), I->getOperand(1)});
}
case Instruction::Shl: {
- // Several vectorized uses? Check if we can truncate it, otherwise - exit.
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
// If we are truncating the result of this SHL, and if it's a shift of an
// inrange amount, we can always perform a SHL in a smaller type.
- if (!AttemptCheckBitwidth(
- [&](unsigned BitWidth, unsigned) {
- KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
- return AmtKnownBits.getMaxValue().ult(BitWidth);
- },
- NeedToExit))
- return false;
- if (NeedToExit)
- return true;
- if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
- return false;
- break;
+ auto ShlChecker = [&](unsigned BitWidth, unsigned) {
+ KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
+ return AmtKnownBits.getMaxValue().ult(BitWidth);
+ };
+ return TryProcessInstruction(
+ I, *ITE, BitWidth, {I->getOperand(0), I->getOperand(1)}, ShlChecker);
}
case Instruction::LShr: {
- // Several vectorized uses? Check if we can truncate it, otherwise - exit.
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
// If this is a truncate of a logical shr, we can truncate it to a smaller
// lshr iff we know that the bits we would otherwise be shifting in are
// already zeros.
- if (!AttemptCheckBitwidth(
- [&](unsigned BitWidth, unsigned OrigBitWidth) {
- KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- return AmtKnownBits.getMaxValue().ult(BitWidth) &&
- MaskedValueIsZero(I->getOperand(0), ShiftedBits,
- SimplifyQuery(*DL));
- },
- NeedToExit))
- return false;
- if (NeedToExit)
- return true;
- if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
- return false;
- break;
+ auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
+ KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ return AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ MaskedValueIsZero(I->getOperand(0), ShiftedBits,
+ SimplifyQuery(*DL));
+ };
+ return TryProcessInstruction(
+ I, *ITE, BitWidth, {I->getOperand(0), I->getOperand(1)}, LShrChecker);
}
case Instruction::AShr: {
- // Several vectorized uses? Check if we can truncate it, otherwise - exit.
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
// If this is a truncate of an arithmetic shr, we can truncate it to a
// smaller ashr iff we know that all the bits from the sign bit of the
// original type and the sign bit of the truncate type are similar.
- if (!AttemptCheckBitwidth(
- [&](unsigned BitWidth, unsigned OrigBitWidth) {
- KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
- unsigned ShiftedBits = OrigBitWidth - BitWidth;
- return AmtKnownBits.getMaxValue().ult(BitWidth) &&
- ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0,
- AC, nullptr, DT);
- },
- NeedToExit))
- return false;
- if (NeedToExit)
- return true;
- if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
- return false;
- break;
+ auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
+ KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
+ unsigned ShiftedBits = OrigBitWidth - BitWidth;
+ return AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ ShiftedBits <
+ ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
+ };
+ return TryProcessInstruction(
+ I, *ITE, BitWidth, {I->getOperand(0), I->getOperand(1)}, AShrChecker);
}
case Instruction::UDiv:
case Instruction::URem: {
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
// UDiv and URem can be truncated if all the truncated bits are zero.
- if (!AttemptCheckBitwidth(
- [&](unsigned BitWidth, unsigned OrigBitWidth) {
- assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
- APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- return MaskedValueIsZero(I->getOperand(0), Mask,
- SimplifyQuery(*DL)) &&
- MaskedValueIsZero(I->getOperand(1), Mask,
- SimplifyQuery(*DL));
- },
- NeedToExit))
- return false;
- if (NeedToExit)
- return true;
- if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
- return false;
- break;
+ auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
+ assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
+ MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
+ };
+ return TryProcessInstruction(I, *ITE, BitWidth,
+ {I->getOperand(0), I->getOperand(1)}, Checker);
}
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
Start = 1;
auto *SI = cast<SelectInst>(I);
- if (!ProcessOperands({SI->getTrueValue(), SI->getFalseValue()}, NeedToExit))
- return false;
- break;
+ return TryProcessInstruction(I, *ITE, BitWidth,
+ {SI->getTrueValue(), SI->getFalseValue()});
}
// We can demote phis if we can demote all their incoming operands. Note that
// we don't need to worry about cycles since we ensure single use above.
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
- if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
- return false;
SmallVector<Value *> Ops(PN->incoming_values().begin(),
PN->incoming_values().end());
- if (!ProcessOperands(Ops, NeedToExit))
- return false;
- break;
+ return TryProcessInstruction(I, *ITE, BitWidth, Ops);
}
// Otherwise, conservatively give up.
default:
- MaxDepthLevel = 1;
- return FinalAnalysis();
+ break;
}
- if (NeedToExit)
- return true;
-
- ++MaxDepthLevel;
- // Gather demoted constant operands.
- for (unsigned Idx : seq<unsigned>(Start, End))
- if (isa<Constant>(I->getOperand(Idx)))
- DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
- // Record the value that we can demote.
- ToDemote.push_back(V);
- return IsProfitableToDemote;
+ MaxDepthLevel = 1;
+ return FinalAnalysis();
}
void BoUpSLP::computeMinimumValueSizes() {
@@ -14266,7 +14336,8 @@ void BoUpSLP::computeMinimumValueSizes() {
bool IsStoreOrInsertElt =
VectorizableTree.front()->getOpcode() == Instruction::Store ||
VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
- if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
+ if ((IsStoreOrInsertElt || UserIgnoreList) &&
+ ExtraBitWidthNodes.size() <= 1 &&
(!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
return;
@@ -14309,7 +14380,8 @@ void BoUpSLP::computeMinimumValueSizes() {
DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
auto ComputeMaxBitWidth = [&](ArrayRef<Value *> TreeRoot, unsigned VF,
bool IsTopRoot, bool IsProfitableToDemoteRoot,
- unsigned Opcode, unsigned Limit, bool IsTruncRoot) {
+ unsigned Opcode, unsigned Limit,
+ bool IsTruncRoot) {
ToDemote.clear();
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
if (!TreeRootIT || !Opcode)
@@ -14469,16 +14541,23 @@ void BoUpSLP::computeMinimumValueSizes() {
IsTopRoot = false;
IsProfitableToDemoteRoot = true;
- if (TruncNodes.empty()) {
+ if (ExtraBitWidthNodes.empty()) {
NodeIdx = VectorizableTree.size();
} else {
unsigned NewIdx = 0;
do {
- NewIdx = *TruncNodes.begin() + 1;
- TruncNodes.erase(TruncNodes.begin());
- } while (NewIdx <= NodeIdx && !TruncNodes.empty());
+ NewIdx = *ExtraBitWidthNodes.begin();
+ ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
+ } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
NodeIdx = NewIdx;
- IsTruncRoot = true;
+ IsTruncRoot =
+ NodeIdx < VectorizableTree.size() &&
+ any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
+ [](const EdgeInfo &EI) {
+ return EI.EdgeIdx == 0 &&
+ EI.UserTE->getOpcode() == Instruction::Trunc &&
+ !EI.UserTE->isAltShuffle();
+ });
}
// If the maximum bit width we compute is less than the width of the roots'
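The SwappedPredsCompatible logic above treats a comparison and its operand-swapped form as one opcode, since e.g. icmp sgt %a, %b and icmp slt %b, %a compute the same bit. A standalone illustration of why that is sound:

#include <cassert>

// 'A > B' and 'B < A' are the same predicate up to operand swapping, which
// is why getSameOpcode can treat such cmps as compatible main opcodes
// instead of forcing an alternate-opcode (shuffle) node.
int main() {
  for (int A = -4; A <= 4; ++A)
    for (int B = -4; B <= 4; ++B)
      assert((A > B) == (B < A));
  return 0;
}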
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 9a8f53c..8ebd75d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -358,23 +358,14 @@ void VPTransformState::addNewMetadata(Instruction *To,
LVer->annotateInstWithNoAlias(To, Orig);
}
-void VPTransformState::addMetadata(Instruction *To, Instruction *From) {
+void VPTransformState::addMetadata(Value *To, Instruction *From) {
// No source instruction to transfer metadata from?
if (!From)
return;
- propagateMetadata(To, From);
- addNewMetadata(To, From);
-}
-
-void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
- // No source instruction to transfer metadata from?
- if (!From)
- return;
-
- for (Value *V : To) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- addMetadata(I, From);
+ if (Instruction *ToI = dyn_cast<Instruction>(To)) {
+ propagateMetadata(ToI, From);
+ addNewMetadata(ToI, From);
}
}
@@ -880,13 +871,15 @@ void VPlan::execute(VPTransformState *State) {
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
- bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
- bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+ bool SinglePartNeeded =
+ isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool NeedsScalar =
+ isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 707a826..77577b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,6 +242,15 @@ struct VPTransformState {
ElementCount VF;
unsigned UF;
+ /// If EVL (Explicit Vector Length) is not nullptr, it must be a valid value
+ /// set during plan transformation, possibly a default value equal to the
+ /// whole vector register length. EVL is created only if TTI prefers
+ /// predicated vectorization, so a non-null EVL also implies a preference
+ /// for predicated vectorization.
+ /// TODO: this is a temporary solution; the recipes should use EVL
+ /// explicitly, at which point this field can be removed.
+ VPValue *EVL = nullptr;
+
/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
@@ -346,11 +355,7 @@ struct VPTransformState {
/// This includes both the original MDs from \p From and additional ones (\see
/// addNewMetadata). Use this for *newly created* instructions in the vector
/// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ void addMetadata(Value *To, Instruction *From);
/// Set the debug location in the builder using the debug location \p DL.
void setDebugLocFrom(DebugLoc DL);
@@ -1163,6 +1168,7 @@ public:
SLPLoad,
SLPStore,
ActiveLaneMask,
+ ExplicitVectorLength,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
@@ -2493,6 +2499,45 @@ public:
#endif
};
+/// A recipe for generating the phi node for the current index of elements,
+/// adjusted in accordance with the EVL value. It starts at the start value of the
+/// canonical induction and gets incremented by EVL in each iteration of the
+/// vector loop.
+class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
+public:
+ VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
+
+ ~VPEVLBasedIVPHIRecipe() override = default;
+
+ VPEVLBasedIVPHIRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
+
+ static inline bool classof(const VPHeaderPHIRecipe *D) {
+ return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
+ }
+
+ /// Generate phi for handling IV based on EVL over iterations correctly.
+ /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
+ void execute(VPTransformState &State) override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
public:
@@ -2526,8 +2571,8 @@ public:
}
};
-/// A recipe for converting the canonical IV value to the corresponding value of
-/// an IV with different start and step values, using Start + CanonicalIV *
+/// A recipe for converting the input \p IV value to the corresponding
+/// value of an IV with different start and step values, using Start + IV *
/// Step.
class VPDerivedIVRecipe : public VPSingleDefRecipe {
/// Kind of the induction.
@@ -2545,16 +2590,16 @@ public:
Start, CanonicalIV, Step) {}
VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
- const FPMathOperator *FPBinOp, VPValue *Start,
- VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
- : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
- Kind(Kind), FPBinOp(FPBinOp) {}
+ const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
+ VPValue *Step)
+ : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
+ FPBinOp(FPBinOp) {}
~VPDerivedIVRecipe() override = default;
VPRecipeBase *clone() override {
- return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(),
- getCanonicalIV(), getStepValue());
+ return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
+ getStepValue());
}
VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
@@ -2574,9 +2619,6 @@ public:
}
VPValue *getStartValue() const { return getOperand(0); }
- VPCanonicalIVPHIRecipe *getCanonicalIV() const {
- return cast<VPCanonicalIVPHIRecipe>(getOperand(1));
- }
VPValue *getStepValue() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 04e3031..c8ae2ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -216,14 +216,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
- VPReductionPHIRecipe, VPWidenPointerInductionRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWienIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+ VPEVLBasedIVPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling since it may be truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 124ae31..1be0287 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -286,6 +286,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeReductionResult:
case VPInstruction::PtrAdd:
+ case VPInstruction::ExplicitVectorLength:
return true;
default:
return false;
@@ -386,6 +387,33 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
return Builder.CreateSelect(Cmp, Sub, Zero);
}
+ case VPInstruction::ExplicitVectorLength: {
+ // Compute the explicit vector length (EVL) via llvm.experimental.get_vector_length.
+ auto GetEVL = [=](VPTransformState &State, Value *AVL) {
+ assert(AVL->getType()->isIntegerTy() &&
+ "Requested vector length should be an integer.");
+
+ // TODO: Add support for MaxSafeDist for correct loop emission.
+ assert(State.VF.isScalable() && "Expected scalable vector factor.");
+ Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+
+ Value *EVL = State.Builder.CreateIntrinsic(
+ State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
+ {AVL, VFArg, State.Builder.getTrue()});
+ return EVL;
+ };
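+ // Note (exposition only): the intrinsic typically yields min(AVL, VF * vscale)
+ // for scalable VFs, but targets may legally return less, e.g. RISC-V may
+ // spread the tail across the final iterations.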
+ // TODO: Restructure this code with an explicit remainder loop so that
+ // vsetvli can be moved outside of the main loop.
+ assert(Part == 0 && "No unrolling expected for predicated vectorization.");
+ // Compute VTC - IV as the AVL (requested vector length).
+ Value *Index = State.get(getOperand(0), VPIteration(0, 0));
+ Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
+ Value *AVL = State.Builder.CreateSub(TripCount, Index);
+ Value *EVL = GetEVL(State, AVL);
+ assert(!State.EVL && "multiple EVL recipes");
+ State.EVL = this;
+ return EVL;
+ }
case VPInstruction::CanonicalIVIncrementForPart: {
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
if (Part == 0)
@@ -592,6 +620,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
+ case VPInstruction::ExplicitVectorLength:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
@@ -628,6 +657,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
+ case VPInstruction::ExplicitVectorLength:
+ O << "EXPLICIT-VECTOR-LENGTH";
+ break;
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
@@ -1184,7 +1216,7 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "= DERIVED-IV ";
getStartValue()->printAsOperand(O, SlotTracker);
O << " + ";
- getCanonicalIV()->printAsOperand(O, SlotTracker);
+ getOperand(1)->printAsOperand(O, SlotTracker);
O << " * ";
getStepValue()->printAsOperand(O, SlotTracker);
}
@@ -1974,3 +2006,25 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif
+
+void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
+ Value *Start = State.get(getOperand(0), VPIteration(0, 0));
+ PHINode *EntryPart =
+ State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
+ EntryPart->addIncoming(Start, VectorPH);
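+ // The backedge incoming value (the EVL-based IV increment) is added later,
+ // when header phis are fixed up during VPlan execution.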
+ EntryPart->setDebugLoc(getDebugLoc());
+ State.set(this, EntryPart, 0, /*IsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 957c97cd..1256e4d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -472,6 +472,26 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
}
}
+/// Returns true if \p R is dead and can be removed.
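+/// A recipe is dead if it has no side effects and none of its defined values
+/// is used; conditional assumes are treated as dead regardless, as their
+/// conditions may be flattened.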
+static bool isDeadRecipe(VPRecipeBase &R) {
+ using namespace llvm::PatternMatch;
+ // Do remove conditional assume instructions as their conditions may be
+ // flattened.
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ bool IsConditionalAssume =
+ RepR && RepR->isPredicated() &&
+ match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
+ if (IsConditionalAssume)
+ return true;
+
+ if (R.mayHaveSideEffects())
+ return false;
+
+ // Recipe is dead if no user keeps the recipe alive.
+ return all_of(R.definedValues(),
+ [](VPValue *V) { return V->getNumUsers() == 0; });
+}
+
static void removeDeadRecipes(VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
@@ -480,22 +500,8 @@ static void removeDeadRecipes(VPlan &Plan) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
- // A user keeps R alive:
- if (any_of(R.definedValues(),
- [](VPValue *V) { return V->getNumUsers(); }))
- continue;
-
- using namespace llvm::PatternMatch;
- // Having side effects keeps R alive, but do remove conditional assume
- // instructions as their conditions may be flattened.
- auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
- bool IsConditionalAssume =
- RepR && RepR->isPredicated() &&
- match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
- if (R.mayHaveSideEffects() && !IsConditionalAssume)
- continue;
-
- R.eraseFromParent();
+ if (isDeadRecipe(R))
+ R.eraseFromParent();
}
}
}
@@ -635,6 +641,25 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
}
}
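+/// Remove the recipe defining \p V if it is dead, then transitively visit its
+/// operands and delete any recipes that became dead as a result.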
+static void recursivelyDeleteDeadRecipes(VPValue *V) {
+ SmallVector<VPValue *> WorkList;
+ SmallPtrSet<VPValue *, 8> Seen;
+ WorkList.push_back(V);
+
+ while (!WorkList.empty()) {
+ VPValue *Cur = WorkList.pop_back_val();
+ if (!Seen.insert(Cur).second)
+ continue;
+ VPRecipeBase *R = Cur->getDefiningRecipe();
+ if (!R)
+ continue;
+ if (!isDeadRecipe(*R))
+ continue;
+ WorkList.append(R->op_begin(), R->op_end());
+ R->eraseFromParent();
+ }
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -668,7 +693,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
auto *BOC =
new VPInstruction(VPInstruction::BranchOnCond,
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
+
+ SmallVector<VPValue *> PossiblyDead(Term->operands());
Term->eraseFromParent();
+ for (VPValue *Op : PossiblyDead)
+ recursivelyDeleteDeadRecipes(Op);
ExitingVPBB->appendRecipe(BOC);
Plan.setVF(BestVF);
Plan.setUF(BestUF);
@@ -1180,6 +1209,45 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
return LaneMaskPhi;
}
+/// Replaces compares matching the (ICMP_ULE, WideCanonicalIV,
+/// backedge-taken-count) pattern with the given \p Idiom.
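+///
+/// For illustration (names hypothetical), a compare such as
+///   %cmp = icmp ule %wide.canonical.iv, %backedge.taken.count
+/// has its uses (optionally filtered by \p Cond) replaced by \p Idiom; the
+/// compare and the wide canonical IV are erased once they have no users left.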
+static void
+replaceHeaderPredicateWith(VPlan &Plan, VPValue &Idiom,
+ function_ref<bool(VPUser &, unsigned)> Cond = {}) {
+ auto *FoundWidenCanonicalIVUser =
+ find_if(Plan.getCanonicalIV()->users(),
+ [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ if (FoundWidenCanonicalIVUser == Plan.getCanonicalIV()->users().end())
+ return;
+ auto *WideCanonicalIV =
+ cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+ // Walk users of WideCanonicalIV and replace all compares of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with
+ // the given idiom VPValue.
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+ auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+ if (!CompareToReplace ||
+ CompareToReplace->getOpcode() != Instruction::ICmp ||
+ CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+ CompareToReplace->getOperand(1) != BTC)
+ continue;
+
+ assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+ "WidenCanonicalIV must be the first operand of the compare");
+ if (Cond) {
+ CompareToReplace->replaceUsesWithIf(&Idiom, Cond);
+ if (!CompareToReplace->getNumUsers())
+ CompareToReplace->eraseFromParent();
+ } else {
+ CompareToReplace->replaceAllUsesWith(&Idiom);
+ CompareToReplace->eraseFromParent();
+ }
+ }
+ if (!WideCanonicalIV->getNumUsers())
+ WideCanonicalIV->eraseFromParent();
+}
+
void VPlanTransforms::addActiveLaneMask(
VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
bool DataAndControlFlowWithoutRuntimeCheck) {
@@ -1209,20 +1277,77 @@ void VPlanTransforms::addActiveLaneMask(
// Walk users of WideCanonicalIV and replace all compares of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
// active-lane-mask.
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
- auto *CompareToReplace = dyn_cast<VPInstruction>(U);
- if (!CompareToReplace ||
- CompareToReplace->getOpcode() != Instruction::ICmp ||
- CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
- CompareToReplace->getOperand(1) != BTC)
- continue;
+ replaceHeaderPredicateWith(Plan, *LaneMask);
+}
- assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
- "WidenCanonicalIV must be the first operand of the compare");
- CompareToReplace->replaceAllUsesWith(LaneMask);
- CompareToReplace->eraseFromParent();
+/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+/// replace all uses of VPCanonicalIVPHIRecipe, except for the canonical IV
+/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation the
+/// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
+///
+/// The function uses the following definitions:
+/// %StartV is the canonical induction start value.
+///
+/// The function adds the following recipes:
+///
+/// vector.ph:
+/// ...
+///
+/// vector.body:
+/// ...
+/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
+/// [ %NextEVLIV, %vector.body ]
+/// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC
+/// ...
+/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
+/// ...
+///
+void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+ // TODO: revisit this and try to remove the mask operand.
+ // Walk VPWidenMemoryInstructionRecipe users of WideCanonicalIV and replace
+ // all compares of the form (ICMP_ULE, WideCanonicalIV, backedge-taken-count),
+ // used as mask in VPWidenMemoryInstructionRecipe, with an all-true-mask.
+ Value *TrueMask =
+ ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext());
+ VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask);
+ replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) {
+ return isa<VPWidenMemoryInstructionRecipe>(U);
+ });
+ // Now create the ExplicitVectorLengthPhi recipe in the main loop.
+ auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+ EVLPhi->insertAfter(CanonicalIVPHI);
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
+ {EVLPhi, Plan.getTripCount()});
+ VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
+
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ VPSingleDefRecipe *OpVPEVL = VPEVL;
+ if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
+ IVSize != 32) {
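+ // The EVL is computed as i32; truncate it for narrower IVs and
+ // zero-extend it for wider ones.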
+ OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
+ : Instruction::ZExt,
+ OpVPEVL, CanonicalIVPHI->getScalarType());
+ OpVPEVL->insertBefore(CanonicalIVIncrement);
}
+ auto *NextEVLIV =
+ new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
+ {CanonicalIVIncrement->hasNoUnsignedWrap(),
+ CanonicalIVIncrement->hasNoSignedWrap()},
+ CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
+ NextEVLIV->insertBefore(CanonicalIVIncrement);
+ EVLPhi->addOperand(NextEVLIV);
+
+ // Replace all uses of VPCanonicalIVPHIRecipe by
+ // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
+ CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
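+ // The RAUW above also replaced the increment's IV operand; restore it below
+ // so the canonical IV keeps incrementing itself.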
+ CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+ // TODO: support unroll factor > 1.
+ Plan.setUF(1);
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
@@ -1248,9 +1373,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) ||
- isa<VPCanonicalIVPHIRecipe>(CurRec) ||
- isa<VPActiveLaneMaskPHIRecipe>(CurRec))
+ isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ff83c3f..0cbc707 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -98,6 +98,13 @@ struct VPlanTransforms {
/// VPlan directly.
static void dropPoisonGeneratingRecipes(
VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication);
+
+ /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+ /// replace all uses of VPCanonicalIVPHIRecipe, except for the canonical IV
+ /// increment, with a VPEVLBasedIVPHIRecipe. After this transformation the
+ /// VPCanonicalIVPHIRecipe is used only to control the loop.
+ static void addExplicitVectorLength(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1d2c17e..8b221d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -368,6 +368,7 @@ public:
// VPHeaderPHIRecipe need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
+ VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ebdb91..12d37fa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -92,7 +92,50 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB,
for (const VPRecipeBase &R : *VPBB)
RecipeNumbering[&R] = Cnt++;
+ // Set of recipe types along with VPInstruction Opcodes of all EVL-related
+ // recipes that must appear at most once in the header block.
+ DenseSet<unsigned> EVLFound;
+ const VPRecipeBase *VPWidenMemRecipe = nullptr;
+ const VPlan *Plan = VPBB->getPlan();
+ bool IsHeader = Plan->getEntry()->getNumSuccessors() == 1 &&
+ Plan->getVectorLoopRegion()->getEntry() == VPBB;
+ auto CheckEVLRecipesInsts = [&](const VPRecipeBase *R) {
+ if (isa<VPEVLBasedIVPHIRecipe>(R)) {
+ if (!IsHeader) {
+ errs() << "EVL PHI recipe not in entry block!\n";
+ return false;
+ }
+ if (!EVLFound.insert(VPDef::VPEVLBasedIVPHISC).second) {
+ errs() << "EVL PHI recipe inserted more than once!\n";
+ return false;
+ }
+ return true;
+ }
+ if (const auto *RInst = dyn_cast<VPInstruction>(R);
+ RInst && RInst->getOpcode() == VPInstruction::ExplicitVectorLength) {
+ if (!IsHeader) {
+ errs() << "EVL instruction not in the header block!\n";
+ return false;
+ }
+ if (!EVLFound.insert(RInst->getOpcode() + VPDef::VPLastPHISC).second) {
+ errs() << "EVL instruction inserted more than once!\n";
+ return false;
+ }
+ if (VPWidenMemRecipe) {
+ errs() << "Use of EVL instruction by widen memory recipe before "
+ "definition!\n";
+ return false;
+ }
+ return true;
+ }
+ if (isa<VPWidenMemoryInstructionRecipe>(R))
+ VPWidenMemRecipe = R;
+ return true;
+ };
+
for (const VPRecipeBase &R : *VPBB) {
+ if (!CheckEVLRecipesInsts(&R))
+ return false;
for (const VPValue *V : R.definedValues()) {
for (const VPUser *U : V->users()) {
auto *UI = dyn_cast<VPRecipeBase>(U);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index af5e7c9..3738220 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -112,6 +112,7 @@ private:
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
+ bool foldShuffleOfCastops(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
bool foldTruncFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1432,6 +1433,75 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
return true;
}
+/// Try to convert "shuffle (castop), (castop)" with matching castops (same
+/// source type and compatible opcodes) into "castop (shuffle)".
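+///
+/// A sketch of the fold, with illustrative types:
+///   shuffle (zext <4 x i8> %x to <4 x i32>), (zext <4 x i8> %y to <4 x i32>), 8-wide mask
+///   --> zext (shuffle <4 x i8> %x, <4 x i8> %y, mask) to <8 x i32>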
+bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
+ Value *V0, *V1;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
+ m_Mask(Mask))))
+ return false;
+
+ auto *C0 = dyn_cast<CastInst>(V0);
+ auto *C1 = dyn_cast<CastInst>(V1);
+ if (!C0 || !C1)
+ return false;
+
+ Instruction::CastOps Opcode = C0->getOpcode();
+ if (Opcode == Instruction::BitCast || C0->getSrcTy() != C1->getSrcTy())
+ return false;
+
+ // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+ if (Opcode != C1->getOpcode()) {
+ if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+ Opcode = Instruction::SExt;
+ else
+ return false;
+ }
+
+ auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+ auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
+ auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
+ if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
+ return false;
+ assert(CastDstTy->getElementCount() == CastSrcTy->getElementCount() &&
+ "Unexpected src/dst element counts");
+
+ auto *NewShuffleDstTy =
+ FixedVectorType::get(CastSrcTy->getScalarType(), Mask.size());
+
+ // Accept the fold only if the narrow shuffle plus a single cast is no more
+ // costly than the original two casts plus the wide shuffle.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ InstructionCost OldCost =
+ TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind) +
+ TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ CastDstTy, Mask, CostKind);
+
+ InstructionCost NewCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, Mask, CostKind);
+ NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
+ TTI::CastContextHint::None, CostKind);
+ if (NewCost > OldCost)
+ return false;
+
+ Value *Shuf =
+ Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), Mask);
+ Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
+
+ // Intersect flags from the old casts.
+ if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
+ NewInst->copyIRFlags(C0);
+ NewInst->andIRFlags(C1);
+ }
+
+ replaceValue(I, *Cast);
+ return true;
+}
+
/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
@@ -1986,6 +2056,7 @@ bool VectorCombine::run() {
break;
case Instruction::ShuffleVector:
MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldShuffleOfCastops(I);
MadeChange |= foldSelectShuffle(I);
break;
case Instruction::BitCast:
diff --git a/llvm/lib/WindowsDriver/MSVCPaths.cpp b/llvm/lib/WindowsDriver/MSVCPaths.cpp
index 634cfcb..a7bffbb 100644
--- a/llvm/lib/WindowsDriver/MSVCPaths.cpp
+++ b/llvm/lib/WindowsDriver/MSVCPaths.cpp
@@ -268,6 +268,7 @@ const char *archToWindowsSDKArch(Triple::ArchType Arch) {
case Triple::ArchType::x86_64:
return "x64";
case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
return "arm";
case Triple::ArchType::aarch64:
return "arm64";
@@ -285,6 +286,7 @@ const char *archToLegacyVCArch(Triple::ArchType Arch) {
case Triple::ArchType::x86_64:
return "amd64";
case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
return "arm";
case Triple::ArchType::aarch64:
return "arm64";
@@ -300,6 +302,7 @@ const char *archToDevDivInternalArch(Triple::ArchType Arch) {
case Triple::ArchType::x86_64:
return "amd64";
case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
return "arm";
case Triple::ArchType::aarch64:
return "arm64";
@@ -321,6 +324,7 @@ bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath,
sys::path::append(LibPath, "x64");
break;
case Triple::arm:
+ case Triple::thumb:
// It is not necessary to link against Windows SDK 7.x when targeting ARM.
return false;
default:
diff --git a/llvm/test/Analysis/CostModel/RISCV/stepvector.ll b/llvm/test/Analysis/CostModel/RISCV/stepvector.ll
index 7d29d2c..e599955 100644
--- a/llvm/test/Analysis/CostModel/RISCV/stepvector.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/stepvector.ll
@@ -4,98 +4,60 @@
define void @stepvector() {
; CHECK-LABEL: 'stepvector'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %39 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %41 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %43 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %7 = call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %8 = call <vscale x 128 x i8> @llvm.experimental.stepvector.nxv128i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <vscale x 64 x i16> @llvm.experimental.stepvector.nxv64i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %20 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = call <vscale x 32 x i32> @llvm.experimental.stepvector.nxv32i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %25 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %26 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %zero = call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
- %1 = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
- %2 = call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
- %3 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %4 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %5 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %6 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %7 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %8 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
- %9 = call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
- %10 = call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
- %11 = call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
- %12 = call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
- %13 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
- %14 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
- %15 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %16 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %17 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %18 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %19 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %20 = call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
- %21 = call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
- %22 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
- %23 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
- %24 = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
- %25 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %26 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %27 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %28 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %29 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %30 = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
- %31 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
- %32 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
- %33 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %34 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %35 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %36 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %37 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %38 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %39 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %40 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %41 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %42 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %43 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %44 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+ call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
+ call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+ call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
+ call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+ call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+ call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
+ call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
+ call <vscale x 128 x i8> @llvm.experimental.stepvector.nxv128i8()
+ call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
+ call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
+ call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
+ call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+ call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
+ call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
+ call <vscale x 64 x i16> @llvm.experimental.stepvector.nxv64i16()
+ call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
+ call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+ call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+ call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+ call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+ call <vscale x 32 x i32> @llvm.experimental.stepvector.nxv32i32()
+ call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+ call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+ call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+ call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+ call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
ret void
}
@@ -107,17 +69,20 @@ declare <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
declare <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
declare <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
declare <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
+declare <vscale x 128 x i8> @llvm.experimental.stepvector.nxv128i8()
declare <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
declare <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
declare <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
declare <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
declare <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
declare <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
+declare <vscale x 64 x i16> @llvm.experimental.stepvector.nxv64i16()
declare <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
declare <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
declare <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
declare <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+declare <vscale x 32 x i32> @llvm.experimental.stepvector.nxv32i32()
declare <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
declare <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
declare <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 64ed9bed..47487d6 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -35,7 +35,7 @@ define i32 @add(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <8 x i1> undef to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <8 x i1> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
@@ -143,7 +143,7 @@ define i32 @zext_sext(<8 x i1> %in) {
;
; AVX1-LABEL: 'zext_sext'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i16> undef to <8 x i32>
@@ -343,7 +343,7 @@ define i32 @masks8(<8 x i1> %in) {
;
; AVX1-LABEL: 'masks8'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'masks8'
@@ -374,7 +374,7 @@ define i32 @masks4(<4 x i1> %in) {
;
; AVX1-LABEL: 'masks4'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <4 x i1> %in to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'masks4'
diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll
index 01efced..4a2585a 100644
--- a/llvm/test/Analysis/CostModel/X86/extend.ll
+++ b/llvm/test/Analysis/CostModel/X86/extend.ll
@@ -1962,7 +1962,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; AVX1-LABEL: 'sext_vXi1'
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64>
@@ -1971,7 +1971,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32>
@@ -2242,7 +2242,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; BTVER2-LABEL: 'sext_vXi1'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64>
@@ -2251,7 +2251,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32>
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
new file mode 100644
index 0000000..55fdaaf
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
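+; The masked_store checks below mirror masked_load above: SSE2/SSE42 have no
+; masked-memory instructions, so the accesses are scalarized and the expected
+; cost grows roughly linearly with the lane count; AVX lowers 32/64-bit
+; element types to vmaskmov, while i8/i16 vectors still scalarize; the
+; AVX-512 prefixes (KNL/SKX) report near-unit costs for the vector types
+; that are legal on each target.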
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
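+; The body below issues one masked store per tested vector type, always with
+; align 1 and a mask of matching width; targets lacking a native masked-store
+; instruction for a given type scalarize it, which is what the per-target
+; estimated costs in the checks above reflect.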
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
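+; masked_gather repeats the experiment for @llvm.masked.gather: in the checks
+; below, the SSE2, SSE42, AVX1, and AVX2 runs report large costs that scale
+; with element count (scalarized gathers), while the SKL run reports near-unit
+; costs for the element types it handles natively.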
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
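+; Costs of llvm.masked.compressstore from <1 x double>/<1 x i64> up to <64 x i8>, per subtarget.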
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
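+; Masked load of <2 x double> with a variable mask from an icmp against zero.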
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+ ret <2 x double> %res
+}
+
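+; Masked load of <4 x i32> with a variable mask.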
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
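+; Masked store of <4 x i32> with a variable mask.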
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
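+; Masked load of <8 x float> with a variable mask; note the costlier <8 x i32> compare on AVX1.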
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
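+; Masked store of the narrow <2 x float> type.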
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
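+; Masked store of the narrow <2 x i32> type.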
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
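+; Masked load of the narrow <2 x float> type.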
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+ ret <2 x float> %res
+}
+
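+; Masked load of the narrow <2 x i32> type.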
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+ ret <2 x i32> %res
+}
+
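+; Gather of <2 x double> through explicit pointer vectors with a variable mask.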
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
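+; Gather of <4 x i32> with a variable mask; KNL and SKX get separate check lines.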
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
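+; Same gather with an all-true constant mask.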
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
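+; Gather of <16 x float> from a base pointer plus sign-extended indices, with an all-true mask.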
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
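+; A note on the checks above (summarizing the autogenerated costs): targets
+; without a native gather (SSE2, SSE42, AVX1, AVX2) are charged the cost of
+; scalarizing the 16-lane gather, roughly one scalar load plus lane
+; insert/extract work per element (44-46), while SKL and AVX512 model a
+; single hardware gather at cost 1.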
+
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
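+; With a variable mask the scalarized cost rises (44 to 76 on SSE, 46 to 78
+; on AVX1/AVX2) because each lane must also extract and test its mask bit.
+; An illustrative sketch of the per-lane expansion those targets pay for
+; (hypothetical names, not taken from this test):
+;   %p0 = extractelement <16 x ptr> %gep.v, i32 0
+;   %m0 = extractelement <16 x i1> %mask, i32 0
+;   br i1 %m0, label %cond.load0, label %cont0
+; Targets with a hardware gather (SKL, AVX512) stay at cost 1 either way.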
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
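+; Identical costs to test_gather_16f32_var_mask: the model does not care
+; whether the pointers derive from one scalar base or arrive as an arbitrary
+; vector of pointers.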
+
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
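+; Splatting the base pointer is costed separately from the gather: one
+; insertelement plus one shufflevector (AVX1 reports 2 for the splat
+; shuffle, every other target 1). The gather itself costs the same as in
+; test_gather_16f32_const_mask.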
+
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
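+; Bitcasting an i16 to <16 x i1> is the usual way to turn an integer mask
+; into a vector predicate, and the model charges it a single unit. Only
+; AVX512 has a hardware scatter (cost 1); the other targets are charged the
+; scalarized cost (80-92), somewhat above the matching gathers.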
+
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x ptr> %ptr, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
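+; At 4 lanes the two AVX512 CPUs diverge: SKX has AVX512VL and therefore
+; 128-bit scatter instructions (cost 1), while KNL lacks AVX512VL and
+; scalarizes, matching the pre-AVX512 targets at cost 20.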
+
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float> %res
+}
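+; The 4-lane gather tests show the same split: SKL and SKX report a single
+; instruction (cost 1), while KNL is charged the scalarization cost, 19 with
+; a variable mask, dropping to 11 with an all-true mask since the per-lane
+; mask tests disappear.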
+
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index 897344d..ad56c28 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2
-;
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 5f22b2e..c7e7c46 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2
-;
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
new file mode 100644
index 0000000..edb05ad
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
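+; Cost estimates for llvm.masked.compressstore, the store counterpart
+; of expandload, over the same element types and widths.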
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
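+; test1-test8: masked.load/masked.store costs where the mask is produced by
+; an icmp eq against zero, so the compare cost is modeled alongside the
+; memory operation.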
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+ ret <2 x float> %res
+}
+
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+ ret <2 x i32> %res
+}
+
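+; The gather tests below compare targets with and without fast hardware
+; gathers. Subtargets the cost model treats as having fast gathers (e.g. SKL
+; and SKX) report cost 1 for a native gather; the remaining subtargets fall
+; back to a scalarized estimate that grows with the number of lanes.
+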
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
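+; With a known all-true mask the scalarized gather estimate drops (SSE2: 15
+; here vs. 23 in test_gather_4i32 above), apparently because no per-lane mask
+; extraction and branching is needed; fast-gather targets still report cost 1.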
+
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
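+; Scatter only exists as a hardware instruction under AVX-512, so the AVX512
+; and SKX runs report cost 1 below, while pre-AVX-512 subtargets are costed
+; as a scalarized per-lane store sequence. KNL still scalarizes the 128-bit
+; case in test_scatter_4i32, consistent with xmm scatters requiring AVX512VL.
+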
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x ptr> %ptr, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float> %res
+}
+
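+; Intrinsic declarations for the masked load/store, gather/scatter and
+; expand-load operations exercised by the tests in this file.
+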
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
new file mode 100644
index 0000000..3ebd9cc
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
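+; A brief orientation sketch, under stated assumptions (the RUN lines live
+; outside this hunk, so the exact invocations are not visible here): tests in
+; this style are conventionally autogenerated, with one check prefix per RUN
+; line that prints the cost model for one subtarget. A hypothetical RUN line
+; for the SSE2 prefix would look like:
+;
+;   opt < %s -passes="print<cost-model>" 2>&1 -disable-output \
+;     -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+;
+; Reading the assertions above: targets without a native gather instruction
+; (SSE2/SSE42/AVX1/AVX2) report a scalarization cost that scales with the
+; element count, while the SKL/KNL/SKX blocks report cost 1 for the types
+; their hardware gathers support, falling back to scalarization costs for
+; the i16/i8 element types.
+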
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
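+; Cost expectations for @llvm.masked.expandload across element types and
+; vector widths. On every prefix checked here the estimated cost roughly
+; doubles with the lane count, consistent with element-wise lowering.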
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
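+; Cost expectations for @llvm.masked.compressstore; as with expandload
+; above, the estimated cost scales with the vector width on every prefix.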
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
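+; test1: v2f64 masked load guarded by a v2i64 compare. The load is cheap on
+; AVX and AVX512 (cost 1-2) but far more expensive on the SSE prefixes
+; (cost 9-10), which have no masked vector load instruction.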
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+ ret <2 x double> %res
+}
+
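+; test2: v4i32 masked load. Integer masked loads are cheap from AVX onwards
+; (cost 1-2) but cost 20+ on the SSE prefixes, where they are scalarized.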
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
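+; test3: v4i32 masked store; note the AVX cost (8) sits between the
+; scalarized SSE cost (20-26) and the single-instruction AVX512 cost (1).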
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
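+; test4: v8f32 masked load with per-CPU prefixes. The masked load itself
+; costs 1-2 on every AVX-class target; AVX1 additionally pays 6 for the
+; 256-bit integer compare feeding the mask, since it lacks 256-bit integer
+; ops.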
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
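+; Sub-128-bit case: a masked store of <2 x float>. Only AVX512 keeps the
+; estimate low (2); all other subtargets report 9-10.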
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
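+; The same sub-128-bit store with an integer element type, <2 x i32>.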
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
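+; test7 and test8 repeat the sub-128-bit cases as masked loads
+; (<2 x float> and <2 x i32>).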
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+ ret <2 x float> %res
+}
+
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+ ret <2 x i32> %res
+}
+
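+; Gather tests. For <2 x double> with a variable mask, only SKL models a
+; native gather (cost 1); the other subtargets report scalarization-level
+; estimates.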
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
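+; An all-true constant mask is cheaper to emulate than a variable one since
+; no per-lane mask tests are needed: SSE2 drops from 23 to 15.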
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
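+; <16 x float> gather through a GEP from a common base with an all-true mask.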
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
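+; The same gather with a variable mask; the emulated estimates rise
+; accordingly (44 -> 76 on SSE2), while SKL and AVX512 stay at 1.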
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
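+; Here the pointers arrive as a <16 x ptr> argument instead of being computed
+; from a single base; the gather estimates are unchanged.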
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
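+; The base pointer is splatted into a vector before the GEP; the gather cost
+; itself matches test_gather_16f32_const_mask.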
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
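+; Scatter tests. Only AVX512 has a native scatter instruction; every other
+; subtarget reports a large per-lane emulation estimate (80-92 for
+; <16 x i32>).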
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
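+; Narrower scatters of <8 x i32> and <4 x i32>. Note the KNL/SKX split for
+; <4 x i32>: KNL lacks the AVX512VL forms needed for 128-bit scatters, so it
+; costs like the pre-AVX512 subtargets (20 vs 1 on SKX).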
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x ptr> %ptr, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
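+; <4 x float> gathers with a variable mask and then an all-true constant
+; mask; KNL again falls back to the pre-AVX512 estimates for the 128-bit
+; form.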
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ac3c47c..200e9d1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -395,6 +395,7 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_SADDSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_USUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
index 499c08f..7921de6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -15,7 +15,7 @@
define void @mul_wrong_pow_2(ptr %addr) { ret void }
define void @more_than_one_use_shl_1(ptr %addr) { ret void }
define void @more_than_one_use_shl_2(ptr %addr) { ret void }
- define void @more_than_one_use_shl_lsl_fast(ptr %addr) #1 { ret void }
+ define void @more_than_one_use_shl_lsl_fast(ptr %addr) { ret void }
define void @more_than_one_use_shl_lsl_slow(ptr %addr) { ret void }
define void @more_than_one_use_shl_minsize(ptr %addr) #0 { ret void }
define void @ldrwrox(ptr %addr) { ret void }
@@ -24,7 +24,6 @@
define void @ldbbrox(ptr %addr) { ret void }
define void @ldrqrox(ptr %addr) { ret void }
attributes #0 = { optsize }
- attributes #1 = { "target-features"="+addr-lsl-fast" }
...
---
@@ -478,11 +477,10 @@ body: |
; CHECK: liveins: $x0, $x1, $x2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
- ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3
- ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
- ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
- ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+ ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+ ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
; CHECK-NEXT: $x2 = COPY [[ADDXrr]]
; CHECK-NEXT: RET_ReallyLR implicit $x2
%0:gpr(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 59cd87f..022aaea 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3
%struct.a = type [256 x i16]
%struct.b = type [256 x i32]
@@ -49,36 +49,20 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
}
define i32 @word(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: word:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT: ubfx x8, x1, #9, #8
-; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT: mov x19, x0
-; CHECK0-NEXT: lsl x21, x8, #2
-; CHECK0-NEXT: ldr w20, [x0, x21]
-; CHECK0-NEXT: bl foo
-; CHECK0-NEXT: mov w0, w20
-; CHECK0-NEXT: str w20, [x19, x21]
-; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: word:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT: ubfx x21, x1, #9, #8
-; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT: mov x19, x0
-; CHECK3-NEXT: ldr w20, [x0, x21, lsl #2]
-; CHECK3-NEXT: bl foo
-; CHECK3-NEXT: mov w0, w20
-; CHECK3-NEXT: str w20, [x19, x21, lsl #2]
-; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT: ret
+; CHECK-LABEL: word:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: ubfx x21, x1, #9, #8
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr w20, [x0, x21, lsl #2]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: str w20, [x19, x21, lsl #2]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -90,36 +74,20 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind {
}
define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: doubleword:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT: ubfx x8, x1, #9, #8
-; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT: mov x19, x0
-; CHECK0-NEXT: lsl x21, x8, #3
-; CHECK0-NEXT: ldr x20, [x0, x21]
-; CHECK0-NEXT: bl foo
-; CHECK0-NEXT: mov x0, x20
-; CHECK0-NEXT: str x20, [x19, x21]
-; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: doubleword:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT: ubfx x21, x1, #9, #8
-; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT: mov x19, x0
-; CHECK3-NEXT: ldr x20, [x0, x21, lsl #3]
-; CHECK3-NEXT: bl foo
-; CHECK3-NEXT: mov x0, x20
-; CHECK3-NEXT: str x20, [x19, x21, lsl #3]
-; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT: ret
+; CHECK-LABEL: doubleword:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: ubfx x21, x1, #9, #8
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr x20, [x0, x21, lsl #3]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: str x20, [x19, x21, lsl #3]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -163,20 +131,12 @@ endbb:
}
define i64 @gep3(ptr %p, i64 %b) {
-; CHECK0-LABEL: gep3:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: lsl x9, x1, #3
-; CHECK0-NEXT: mov x8, x0
-; CHECK0-NEXT: ldr x0, [x0, x9]
-; CHECK0-NEXT: str x1, [x8, x9]
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: gep3:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: mov x8, x0
-; CHECK3-NEXT: ldr x0, [x0, x1, lsl #3]
-; CHECK3-NEXT: str x1, [x8, x1, lsl #3]
-; CHECK3-NEXT: ret
+; CHECK-LABEL: gep3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: ldr x0, [x0, x1, lsl #3]
+; CHECK-NEXT: str x1, [x8, x1, lsl #3]
+; CHECK-NEXT: ret
%g = getelementptr inbounds i64, ptr %p, i64 %b
%l = load i64, ptr %g
store i64 %b, ptr %g
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index 573f921..e31c9a0 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -134,9 +134,8 @@ define void @test8(i64 %a, ptr noalias %src, ptr noalias %dst, i64 %n) {
; CHECK-NEXT: b.hs .LBB7_1
; CHECK-NEXT: // %bb.3: // %if.then
; CHECK-NEXT: // in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT: lsl x10, x8, #3
-; CHECK-NEXT: ldr x11, [x1, x10]
-; CHECK-NEXT: str x11, [x2, x10]
+; CHECK-NEXT: ldr x10, [x1, x8, lsl #3]
+; CHECK-NEXT: str x10, [x2, x8, lsl #3]
; CHECK-NEXT: b .LBB7_1
; CHECK-NEXT: .LBB7_4: // %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index d593272..6bcd2f0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -125,7 +125,7 @@ return: ; preds = %if.end23, %if.then3
}
; CHECK: @test
-; CHECK-NOT: , uxtw #2]
+; CHECK: , uxtw #2]
define i32 @test(ptr %array, i8 zeroext %c, i32 %arg) {
entry:
%conv = zext i8 %c to i32
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 3542b26..5b055a4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -201,11 +201,10 @@ define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_64x1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x1
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x1]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset
@@ -238,11 +237,10 @@ define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_32x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x2
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x2]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset
@@ -275,11 +273,10 @@ define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_16x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x4
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x4]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset
@@ -312,11 +309,10 @@ define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_8x8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray8x8
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x8]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset
diff --git a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
index 8f19553..634d1b9 100644
--- a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
@@ -82,13 +82,12 @@ define void @avoid_promotion_2_and(ptr nocapture noundef %arg) {
; CHECK-NEXT: eor w10, w10, w11
; CHECK-NEXT: ldur w11, [x8, #-24]
; CHECK-NEXT: and w10, w10, w14
-; CHECK-NEXT: ldp x15, x14, [x8, #-16]
-; CHECK-NEXT: ubfiz x13, x10, #1, #32
+; CHECK-NEXT: ldp x14, x13, [x8, #-16]
; CHECK-NEXT: str w10, [x8]
-; CHECK-NEXT: and w10, w11, w12
-; CHECK-NEXT: ldrh w11, [x14, x13]
-; CHECK-NEXT: strh w11, [x15, w10, uxtw #1]
-; CHECK-NEXT: strh w12, [x14, x13]
+; CHECK-NEXT: and w11, w11, w12
+; CHECK-NEXT: ldrh w15, [x13, w10, uxtw #1]
+; CHECK-NEXT: strh w15, [x14, w11, uxtw #1]
+; CHECK-NEXT: strh w12, [x13, w10, uxtw #1]
; CHECK-NEXT: b LBB1_1
; CHECK-NEXT: LBB1_4: ; %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
index b5c2104..50c70c5 100644
--- a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
+++ b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux"
define void @f0(ptr %a, i64 %n) {
; CHECK-LABEL: f0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
@@ -15,7 +15,6 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w23, -40
; CHECK-NEXT: .cfi_offset w30, -48
; CHECK-NEXT: mov x21, #1 // =0x1
; CHECK-NEXT: mov x19, x1
@@ -27,18 +26,17 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB0_2
; CHECK-NEXT: .LBB0_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x23, x22, #2
+; CHECK-NEXT: ldr w0, [x20, x22, lsl #2]
; CHECK-NEXT: mov x1, x21
-; CHECK-NEXT: ldr w0, [x20, x23]
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x23]
+; CHECK-NEXT: str w0, [x20, x22, lsl #2]
; CHECK-NEXT: add x22, x22, #1
; CHECK-NEXT: cmp x22, x19
; CHECK-NEXT: b.lt .LBB0_1
; CHECK-NEXT: .LBB0_2: // %exit
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
br label %loop
@@ -64,15 +62,13 @@ exit:
define void @f1(ptr %a, i64 %n) {
; CHECK-LABEL: f1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: .cfi_offset w30, -32
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: mov x21, xzr
@@ -80,19 +76,17 @@ define void @f1(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB1_2
; CHECK-NEXT: .LBB1_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x22, x21, #2
+; CHECK-NEXT: ldr w0, [x20, x21, lsl #2]
; CHECK-NEXT: mov x1, #1450704896 // =0x56780000
; CHECK-NEXT: movk x1, #4660, lsl #48
-; CHECK-NEXT: ldr w0, [x20, x22]
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x22]
+; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: add x21, x21, #1
; CHECK-NEXT: cmp x21, x19
; CHECK-NEXT: b.lt .LBB1_1
; CHECK-NEXT: .LBB1_2: // %exit
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
index d4ea143..b87157a 100644
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -972,10 +972,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x1]
; CHECK-NEXT: ubfx x8, x8, #21, #10
-; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: ldr w9, [x0, x8]
+; CHECK-NEXT: ldr w9, [x0, x8, lsl #2]
; CHECK-NEXT: add w9, w9, #1
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str w9, [x0, x8, lsl #2]
; CHECK-NEXT: ret
%tmp = load i64, ptr %a1, align 8
%tmp1 = lshr i64 %tmp, 21
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 491bf40..c0f7678 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -903,6 +903,58 @@ define <8 x i16> @shadd_fixedwidth_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
+define <8 x i16> @shadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: shadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
+define <8 x i16> @srhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: srhadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
+define <8 x i16> @uhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: uhadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
+define <8 x i16> @urhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: urhadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
@@ -927,4 +979,4 @@ declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 30123a3..e8dafd5 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -223,10 +223,9 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
; CHECK-NEXT: // => This Loop Header: Depth=2
; CHECK-NEXT: // Child Loop BB3_3 Depth 3
-; CHECK-NEXT: lsl x12, x11, #3
+; CHECK-NEXT: ldr x13, [x1, x11, lsl #3]
+; CHECK-NEXT: ldr x12, [x10, x11, lsl #3]
; CHECK-NEXT: mov x14, x4
-; CHECK-NEXT: ldr x13, [x1, x12]
-; CHECK-NEXT: ldr x12, [x10, x12]
; CHECK-NEXT: ldr w13, [x13]
; CHECK-NEXT: .LBB3_3: // %for.body8
; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
diff --git a/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll
new file mode 100644
index 0000000..728cffe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll
@@ -0,0 +1,50 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- ok.ll
+
+; RUN: llc -mtriple=aarch64-linux ok.ll -o - | \
+; RUN: FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=aarch64-linux ok.ll -filetype=obj -o - | \
+; RUN: llvm-readelf --notes - | FileCheck %s --check-prefix=OBJ
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 85}
+
+; ASM: .section .note.gnu.property,"a",@note
+; ASM-NEXT: .p2align 3, 0x0
+; ASM-NEXT: .word 4
+; ASM-NEXT: .word 24
+; ASM-NEXT: .word 5
+; ASM-NEXT: .asciz "GNU"
+; 3221225473 = 0xc0000001 = GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+; ASM-NEXT: .word 3221225473
+; ASM-NEXT: .word 16
+; ASM-NEXT: .xword 268435458
+; ASM-NEXT: .xword 85
+
+; OBJ: Displaying notes found in: .note.gnu.property
+; OBJ-NEXT: Owner Data size Description
+; OBJ-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note)
+; OBJ-NEXT: AArch64 PAuth ABI core info: platform 0x10000002 (llvm_linux), version 0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)
+
+; ERR: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present
+
+;--- err1.ll
+
+; RUN: not llc -mtriple=aarch64-linux err1.ll 2>&1 -o - | \
+; RUN: FileCheck %s --check-prefix=ERR
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2}
+
+;--- err2.ll
+
+; RUN: not llc -mtriple=aarch64-linux err2.ll 2>&1 -o - | \
+; RUN: FileCheck %s --check-prefix=ERR
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31}
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
index 9e09b7f..789fd7b 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -2,8 +2,6 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for vec
-
declare i4 @llvm.sadd.sat.i4(i4, i4)
declare i8 @llvm.sadd.sat.i8(i8, i8)
declare i16 @llvm.sadd.sat.i16(i16, i16)
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 6f1ae02..8a0e766 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -67,23 +49,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sqadd v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: sqadd v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: sqadd v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: sqadd v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: sqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: sqadd v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: sqadd v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: sqadd v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: sqadd v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: sqadd v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: sqadd v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: sqadd v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -98,23 +94,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: sqadd v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: sqadd v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sqadd v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: sqadd v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: sqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: sqadd v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: sqadd v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: sqadd v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: sqadd v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: sqadd v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: sqadd v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: sqadd v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -135,19 +145,42 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: sqadd v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: sqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -196,23 +229,37 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: add x9, x1, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
+; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: add x9, x1, #2
+; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -230,15 +277,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: sqadd v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: sqadd v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: sqadd v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: sqadd v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: sqadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: sqadd v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -346,23 +405,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: sqadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: sqadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: sqadd v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: sqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: sqadd v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: sqadd v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: sqadd v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: sqadd v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: sqadd v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: sqadd v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: sqadd v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -377,23 +450,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: sqadd v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: sqadd v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: sqadd v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: sqadd v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: sqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: sqadd v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: sqadd v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: sqadd v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: sqadd v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: sqadd v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: sqadd v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: sqadd v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 5200722..f65a08a 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -100,7 +100,7 @@ exit:
}
; Address calculation cheap enough on some cores.
-define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
+define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tbz w0, #0, .LBB3_2
@@ -130,7 +130,7 @@ exit:
ret i32 %v
}
-define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
+define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp x1, #1
diff --git a/llvm/test/CodeGen/AArch64/sms-regpress.mir b/llvm/test/CodeGen/AArch64/sms-regpress.mir
new file mode 100644
index 0000000..c75eba5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-regpress.mir
@@ -0,0 +1,160 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-max-mii=40 -pipeliner-register-pressure -pipeliner-ii-search-range=30 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# REQUIRES: asserts
+
+# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
+# The specific value of II is not important.
+
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Rejected the schedule because of too high register pressure{{$}}
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Schedule Found? 1 (II={{[0-9]+}}){{$}}
+
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+ define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr {
+ entry:
+ %0 = load double, ptr %a, align 8
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 8
+ %1 = load double, ptr %arrayidx1, align 8
+ %cmp133 = icmp sgt i32 %n, 0
+ br i1 %cmp133, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add54, %for.body ]
+ ret double %res.0.lcssa
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv137 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %lsr.iv = phi ptr [ %b, %for.body.preheader ], [ %scevgep, %for.body ]
+ %res.0135 = phi double [ 0.000000e+00, %for.body.preheader ], [ %add54, %for.body ]
+ %2 = load double, ptr %lsr.iv, align 8
+ %3 = tail call double @llvm.fmuladd.f64(double %0, double %2, double %0)
+ %4 = tail call double @llvm.fmuladd.f64(double %3, double %2, double %3)
+ %5 = tail call double @llvm.fmuladd.f64(double %4, double %2, double %4)
+ %6 = tail call double @llvm.fmuladd.f64(double %5, double %2, double %5)
+ %7 = tail call double @llvm.fmuladd.f64(double %6, double %2, double %6)
+ %8 = tail call double @llvm.fmuladd.f64(double %7, double %2, double %7)
+ %9 = tail call double @llvm.fmuladd.f64(double %8, double %2, double %8)
+ %10 = tail call double @llvm.fmuladd.f64(double %9, double %2, double %9)
+ %11 = tail call double @llvm.fmuladd.f64(double %10, double %2, double %10)
+ %12 = tail call double @llvm.fmuladd.f64(double %11, double %2, double %11)
+ %13 = tail call double @llvm.fmuladd.f64(double %12, double %2, double %12)
+ %14 = tail call double @llvm.fmuladd.f64(double %13, double %2, double %13)
+ %15 = tail call double @llvm.fmuladd.f64(double %14, double %2, double %14)
+ %16 = tail call double @llvm.fmuladd.f64(double %15, double %2, double %15)
+ %17 = tail call double @llvm.fmuladd.f64(double %16, double %2, double %16)
+ %18 = tail call double @llvm.fmuladd.f64(double %17, double %2, double %17)
+ %add = fadd double %17, %18
+ %19 = tail call double @llvm.fmuladd.f64(double %18, double %2, double %add)
+ %add35 = fadd double %10, %19
+ %20 = tail call double @llvm.fmuladd.f64(double %3, double %2, double %add35)
+ %add38 = fadd double %11, %20
+ %21 = tail call double @llvm.fmuladd.f64(double %4, double %2, double %add38)
+ %add41 = fadd double %12, %21
+ %22 = tail call double @llvm.fmuladd.f64(double %5, double %2, double %add41)
+ %add44 = fadd double %14, %15
+ %add45 = fadd double %13, %add44
+ %add46 = fadd double %add45, %22
+ %23 = tail call double @llvm.fmuladd.f64(double %6, double %2, double %add46)
+ %mul = fmul double %2, %7
+ %mul51 = fmul double %1, %mul
+ %24 = tail call double @llvm.fmuladd.f64(double %mul51, double %9, double %23)
+ %25 = tail call double @llvm.fmuladd.f64(double %8, double %1, double %24)
+ %add54 = fadd double %res.0135, %25
+ %scevgep = getelementptr i8, ptr %lsr.iv, i64 8
+ %lsr.iv.next = add nsw i64 %lsr.iv137, -1
+ %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ }
+
+ declare double @llvm.fmuladd.f64(double, double, double)
+
+...
+---
+name: kernel
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%10' }
+ - { reg: '$x1', virtual-reg: '%11' }
+ - { reg: '$w2', virtual-reg: '%12' }
+body: |
+ bb.0.entry:
+ successors: %bb.1, %bb.4
+ liveins: $x0, $x1, $w2
+
+ %12:gpr32common = COPY $w2
+ %11:gpr64 = COPY $x1
+ %10:gpr64common = COPY $x0
+ dead $wzr = SUBSWri %12, 1, 0, implicit-def $nzcv
+ Bcc 10, %bb.1, implicit $nzcv
+
+ bb.4:
+ %13:fpr64 = FMOVD0
+ B %bb.2
+
+ bb.1.for.body.preheader:
+ %0:fpr64 = LDRDui %10, 0 :: (load (s64) from %ir.a)
+ %1:fpr64 = LDRDui %10, 1 :: (load (s64) from %ir.arrayidx1)
+ %16:gpr32 = ORRWrs $wzr, %12, 0
+ %2:gpr64all = SUBREG_TO_REG 0, killed %16, %subreg.sub_32
+ %15:fpr64 = FMOVD0
+ B %bb.3
+
+ bb.2.for.cond.cleanup:
+ %3:fpr64 = PHI %13, %bb.4, %7, %bb.3
+ $d0 = COPY %3
+ RET_ReallyLR implicit $d0
+
+ bb.3.for.body:
+ successors: %bb.2, %bb.3
+
+ %4:gpr64sp = PHI %2, %bb.1, %9, %bb.3
+ %5:gpr64sp = PHI %11, %bb.1, %8, %bb.3
+ %6:fpr64 = PHI %15, %bb.1, %7, %bb.3
+ early-clobber %17:gpr64sp, %18:fpr64 = LDRDpost %5, 8 :: (load (s64) from %ir.lsr.iv)
+ %19:fpr64 = nofpexcept FMADDDrrr %0, %18, %0, implicit $fpcr
+ %20:fpr64 = nofpexcept FMADDDrrr %19, %18, %19, implicit $fpcr
+ %21:fpr64 = nofpexcept FMADDDrrr %20, %18, %20, implicit $fpcr
+ %22:fpr64 = nofpexcept FMADDDrrr %21, %18, %21, implicit $fpcr
+ %23:fpr64 = nofpexcept FMADDDrrr %22, %18, %22, implicit $fpcr
+ %24:fpr64 = nofpexcept FMADDDrrr %23, %18, %23, implicit $fpcr
+ %25:fpr64 = nofpexcept FMADDDrrr %24, %18, %24, implicit $fpcr
+ %26:fpr64 = nofpexcept FMADDDrrr %25, %18, %25, implicit $fpcr
+ %27:fpr64 = nofpexcept FMADDDrrr %26, %18, %26, implicit $fpcr
+ %28:fpr64 = nofpexcept FMADDDrrr %27, %18, %27, implicit $fpcr
+ %29:fpr64 = nofpexcept FMADDDrrr %28, %18, %28, implicit $fpcr
+ %30:fpr64 = nofpexcept FMADDDrrr %29, %18, %29, implicit $fpcr
+ %31:fpr64 = nofpexcept FMADDDrrr %30, %18, %30, implicit $fpcr
+ %32:fpr64 = nofpexcept FMADDDrrr %31, %18, %31, implicit $fpcr
+ %33:fpr64 = nofpexcept FMADDDrrr %32, %18, %32, implicit $fpcr
+ %34:fpr64 = nofpexcept FMADDDrrr %33, %18, %33, implicit $fpcr
+ %35:fpr64 = nofpexcept FADDDrr %33, %34, implicit $fpcr
+ %36:fpr64 = nofpexcept FMADDDrrr %34, %18, killed %35, implicit $fpcr
+ %37:fpr64 = nofpexcept FADDDrr %26, killed %36, implicit $fpcr
+ %38:fpr64 = nofpexcept FMADDDrrr %19, %18, killed %37, implicit $fpcr
+ %39:fpr64 = nofpexcept FADDDrr %27, killed %38, implicit $fpcr
+ %40:fpr64 = nofpexcept FMADDDrrr %20, %18, killed %39, implicit $fpcr
+ %41:fpr64 = nofpexcept FADDDrr %28, killed %40, implicit $fpcr
+ %42:fpr64 = nofpexcept FMADDDrrr %21, %18, killed %41, implicit $fpcr
+ %43:fpr64 = nofpexcept FADDDrr %30, %31, implicit $fpcr
+ %44:fpr64 = nofpexcept FADDDrr %29, killed %43, implicit $fpcr
+ %45:fpr64 = nofpexcept FADDDrr killed %44, killed %42, implicit $fpcr
+ %46:fpr64 = nofpexcept FMADDDrrr %22, %18, killed %45, implicit $fpcr
+ %47:fpr64 = nofpexcept FMULDrr %18, %23, implicit $fpcr
+ %48:fpr64 = nofpexcept FMULDrr %1, killed %47, implicit $fpcr
+ %49:fpr64 = nofpexcept FMADDDrrr killed %48, %25, killed %46, implicit $fpcr
+ %50:fpr64 = nofpexcept FMADDDrrr %24, %1, killed %49, implicit $fpcr
+ %7:fpr64 = nofpexcept FADDDrr %6, killed %50, implicit $fpcr
+ %8:gpr64all = COPY %17
+ %51:gpr64 = nsw SUBSXri %4, 1, 0, implicit-def $nzcv
+ %9:gpr64all = COPY %51
+ Bcc 0, %bb.2, implicit $nzcv
+ B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
index abeb4b3..4d755f4 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -2,8 +2,6 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for vec
-
declare i4 @llvm.ssub.sat.i4(i4, i4)
declare i8 @llvm.ssub.sat.i8(i8, i8)
declare i16 @llvm.ssub.sat.i16(i16, i16)
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index d1f843a..a8c1276 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -68,23 +50,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sqsub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: sqsub v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: sqsub v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: sqsub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: sqsub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: sqsub v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: sqsub v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: sqsub v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: sqsub v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: sqsub v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: sqsub v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: sqsub v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -99,23 +95,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: sqsub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: sqsub v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sqsub v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: sqsub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: sqsub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: sqsub v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: sqsub v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: sqsub v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: sqsub v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: sqsub v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: sqsub v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: sqsub v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -136,19 +146,42 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: sqsub v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: sqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -197,23 +230,37 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: add x9, x1, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
+; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: add x9, x1, #2
+; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -231,15 +278,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: sqsub v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: sqsub v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: sqsub v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: sqsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: sqsub v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -349,23 +408,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: sqsub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: sqsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: sqsub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: sqsub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: sqsub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: sqsub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: sqsub v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: sqsub v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: sqsub v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: sqsub v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: sqsub v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -380,23 +453,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: sqsub v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: sqsub v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: sqsub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: sqsub v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: sqsub v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: sqsub v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: sqsub v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: sqsub v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: sqsub v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: sqsub v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: sqsub v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: sqsub v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
new file mode 100644
index 0000000..bcfc7b3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <4 x i32> @masked_load_v4i32(ptr %a, <4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_load_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %load = call <4 x i32> @llvm.masked.load.v4i32(ptr %a, i32 1, <4 x i1> %mask, <4 x i32> undef), !nontemporal !0
+ ret <4 x i32> %load
+}
+
+define void @masked_store_v4i32(<4 x i32> %x, ptr %a, <4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %x, ptr %a, i32 1, <4 x i1> %mask), !nontemporal !0
+ ret void
+}
+
+define <4 x i32> @load_v4i32(ptr %a) nounwind {
+; CHECK-LABEL: load_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+ %load = call <4 x i32> @llvm.masked.load.v4i32(ptr %a, i32 1, <4 x i1> <i1 1, i1 1, i1 1, i1 1>, <4 x i32> undef), !nontemporal !0
+ ret <4 x i32> %load
+}
+
+define void @store_v4i32(<4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %x, ptr %a, i32 1, <4 x i1> <i1 1, i1 1, i1 1, i1 1>), !nontemporal !0
+ ret void
+}
+
+define <vscale x 4 x i32> @masked_load_nxv4i32(ptr %a, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef), !nontemporal !0
+ ret <vscale x 4 x i32> %load
+}
+
+define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> %mask), !nontemporal !0
+ ret void
+}
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index f0bbed5..30ff700 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -67,23 +49,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uqadd v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: uqadd v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: uqadd v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: uqadd v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: uqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: uqadd v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: uqadd v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uqadd v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: uqadd v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: uqadd v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: uqadd v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: uqadd v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -98,23 +94,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: uqadd v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: uqadd v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: uqadd v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: uqadd v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: uqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: uqadd v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: uqadd v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: uqadd v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: uqadd v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: uqadd v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: uqadd v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: uqadd v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -135,16 +145,39 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s1, [x0]
-; CHECK-NEXT: ldr s2, [x1]
-; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
-; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b
-; CHECK-NEXT: umin v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s1, [x0]
+; CHECK-SD-NEXT: ldr s2, [x1]
+; CHECK-SD-NEXT: movi d0, #0xff00ff00ff00ff
+; CHECK-SD-NEXT: uaddl v1.8h, v1.8b, v2.8b
+; CHECK-SD-NEXT: umin v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: uqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -194,24 +227,38 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: movi d2, #0x00ffff0000ffff
-; CHECK-NEXT: ldrh w10, [x0, #2]
-; CHECK-NEXT: ldrh w11, [x1, #2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: umin v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: ldrh w9, [x1]
+; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT: ldrh w10, [x0, #2]
+; CHECK-SD-NEXT: ldrh w11, [x1, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: mov v0.s[1], w10
+; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -229,15 +276,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: uqadd v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: uqadd v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: uqadd v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: uqadd v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: uqadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: uqadd v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -336,23 +395,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: uqadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: uqadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: uqadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: uqadd v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: uqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: uqadd v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: uqadd v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: uqadd v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: uqadd v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: uqadd v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: uqadd v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: uqadd v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -367,23 +440,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: uqadd v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: uqadd v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: uqadd v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: uqadd v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: uqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: uqadd v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: uqadd v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: uqadd v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: uqadd v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: uqadd v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: uqadd v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: uqadd v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 82c0327..3bc2796 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -68,23 +50,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uqsub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: uqsub v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: uqsub v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: uqsub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: uqsub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: uqsub v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: uqsub v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uqsub v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: uqsub v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: uqsub v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: uqsub v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: uqsub v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -99,23 +95,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: uqsub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: uqsub v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: uqsub v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: uqsub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: uqsub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: uqsub v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: uqsub v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: uqsub v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: uqsub v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: uqsub v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: uqsub v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: uqsub v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -136,16 +146,39 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: uqsub v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: uqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -193,22 +226,36 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: ldrh w10, [x0, #2]
-; CHECK-NEXT: ldrh w11, [x1, #2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: ldrh w9, [x1]
+; CHECK-SD-NEXT: ldrh w10, [x0, #2]
+; CHECK-SD-NEXT: ldrh w11, [x1, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: mov v0.s[1], w10
+; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -226,15 +273,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: uqsub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: uqsub v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: uqsub v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: uqsub v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: uqsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: uqsub v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.usub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -334,23 +393,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: uqsub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: uqsub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: uqsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: uqsub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: uqsub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: uqsub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: uqsub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: uqsub v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: uqsub v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: uqsub v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: uqsub v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: uqsub v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -365,23 +438,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: uqsub v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: uqsub v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: uqsub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: uqsub v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: uqsub v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: uqsub v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: uqsub v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: uqsub v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: uqsub v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: uqsub v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: uqsub v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: uqsub v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index c25b0f2..78d9084 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB0_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB1_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB2_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = trunc i32 %value to i1
@@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB3_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%value = load i32, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 303dc46..5c22d5b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index 63702d2..e005c38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 352adac..af6f6913 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
@@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3081,11 +3079,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3629,11 +3627,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
@@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
@@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 19a1d2d9..c9076a9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 9865883..bf4302c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1
-; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2
-; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3
-; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
+; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0, bfloat %a, 1
@@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
-; GFX11-NEXT: s_add_i32 s8, s0, 0x70
-; GFX11-NEXT: s_add_i32 s9, s0, 0x60
-; GFX11-NEXT: s_add_i32 s10, s0, 0x50
-; GFX11-NEXT: s_add_i32 s11, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
@@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
-; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1
-; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2
-; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3
-; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4
-; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index ac50fb8..da609bf 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 069c57e..6dabd8c 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -131,7 +130,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB0_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB1_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: .LBB2_4: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB2_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB3_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB4_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB5_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB7_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
@@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index db89ad6..3b2f15c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: .LBB3_2: ; %bb2
; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_arg_i1_use:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee2..401cbce 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0
+; GFX11-NEXT: scratch_store_b32 v0, v33, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c1d6826..3b078c4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1989,256 +1989,138 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x790
-; GFX11-NEXT: s_add_i32 s2, s0, 0x780
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x770
-; GFX11-NEXT: s_add_i32 s2, s0, 0x760
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x750
-; GFX11-NEXT: s_add_i32 s2, s0, 0x740
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x730
-; GFX11-NEXT: s_add_i32 s2, s0, 0x720
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x710
-; GFX11-NEXT: s_add_i32 s2, s0, 0x700
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x690
-; GFX11-NEXT: s_add_i32 s2, s0, 0x680
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x670
-; GFX11-NEXT: s_add_i32 s2, s0, 0x660
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x650
-; GFX11-NEXT: s_add_i32 s2, s0, 0x640
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x630
-; GFX11-NEXT: s_add_i32 s2, s0, 0x620
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x610
-; GFX11-NEXT: s_add_i32 s2, s0, 0x600
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x590
-; GFX11-NEXT: s_add_i32 s2, s0, 0x580
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x570
-; GFX11-NEXT: s_add_i32 s2, s0, 0x560
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x550
-; GFX11-NEXT: s_add_i32 s2, s0, 0x540
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x530
-; GFX11-NEXT: s_add_i32 s2, s0, 0x520
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x510
-; GFX11-NEXT: s_add_i32 s2, s0, 0x500
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x490
-; GFX11-NEXT: s_add_i32 s2, s0, 0x480
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x470
-; GFX11-NEXT: s_add_i32 s2, s0, 0x460
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x450
-; GFX11-NEXT: s_add_i32 s2, s0, 0x440
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x430
-; GFX11-NEXT: s_add_i32 s2, s0, 0x420
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x410
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x390
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x380
-; GFX11-NEXT: s_add_i32 s2, s0, 0x370
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x360
-; GFX11-NEXT: s_add_i32 s2, s0, 0x350
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x340
-; GFX11-NEXT: s_add_i32 s2, s0, 0x330
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x320
-; GFX11-NEXT: s_add_i32 s2, s0, 0x310
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x300
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x290
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x280
-; GFX11-NEXT: s_add_i32 s2, s0, 0x270
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x260
-; GFX11-NEXT: s_add_i32 s2, s0, 0x250
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x240
-; GFX11-NEXT: s_add_i32 s2, s0, 0x230
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x220
-; GFX11-NEXT: s_add_i32 s2, s0, 0x210
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x190
-; GFX11-NEXT: s_add_i32 s2, s0, 0x180
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x170
-; GFX11-NEXT: s_add_i32 s2, s0, 0x160
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x150
-; GFX11-NEXT: s_add_i32 s2, s0, 0x140
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x130
-; GFX11-NEXT: s_add_i32 s2, s0, 0x120
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: s_add_i32 s2, s0, 0xf0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xd0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xb0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x90
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <512 x i32> zeroinitializer
@@ -2636,7 +2518,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
@@ -2651,93 +2532,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164
-; GFX11-NEXT: s_clause 0x14
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT: s_clause 0x11
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
+; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
+; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
+; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: s_clause 0xd
-; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148
+; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132
+; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x100
-; GFX11-NEXT: s_add_i32 s3, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s34, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s35, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s36, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s37, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s38, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s39, s0, 0x90
-; GFX11-NEXT: s_add_i32 s40, s0, 0x70
-; GFX11-NEXT: s_add_i32 s41, s0, 0x60
-; GFX11-NEXT: s_add_i32 s42, s0, 0x50
-; GFX11-NEXT: s_add_i32 s43, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
@@ -3306,7 +3176,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-LABEL: call_72xi32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s46, s33
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00
@@ -3353,11 +3223,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
+; GFX11-NEXT: s_add_i32 s2, s33, 0x200
+; GFX11-NEXT: v_writelane_b32 v60, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT: s_add_i32 s0, s33, 0x200
-; GFX11-NEXT: v_writelane_b32 v60, s30, 0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
@@ -3373,14 +3243,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi
-; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo
+; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo
; GFX11-NEXT: v_writelane_b32 v60, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624
; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640
-; GFX11-NEXT: s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT: s_add_i32 s2, s32, 0xa0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v32, v48
; GFX11-NEXT: s_clause 0x9
@@ -3431,38 +3301,38 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9
; GFX11-NEXT: v_mov_b32_e32 v9, v20
-; GFX11-NEXT: scratch_store_b32 off, v11, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x90
+; GFX11-NEXT: scratch_store_b32 off, v11, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x90
; GFX11-NEXT: v_mov_b32_e32 v11, v22
-; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x80
+; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x80
; GFX11-NEXT: v_mov_b32_e32 v5, v16
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-NEXT: v_mov_b32_e32 v0, 24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x70
+; GFX11-NEXT: s_add_i32 s2, s32, 0x70
; GFX11-NEXT: v_mov_b32_e32 v6, v17
-; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0
+; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2
; GFX11-NEXT: v_mov_b32_e32 v13, v24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x6c
+; GFX11-NEXT: s_add_i32 s2, s32, 0x6c
; GFX11-NEXT: v_mov_b32_e32 v7, v18
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x60
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x60
; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26
-; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x50
+; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x50
; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
-; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 64
; GFX11-NEXT: v_mov_b32_e32 v14, v25
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 48
+; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 48
; GFX11-NEXT: v_mov_b32_e32 v16, v27
-; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 32
+; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 32
; GFX11-NEXT: v_mov_b32_e32 v30, v46
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 16
-; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 16
+; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
@@ -3470,10 +3340,10 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540
-; GFX11-NEXT: s_add_i32 s0, s33, 0x400
+; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
@@ -3493,7 +3363,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0xf600
-; GFX11-NEXT: s_mov_b32 s33, s46
+; GFX11-NEXT: s_mov_b32 s33, s34
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 433a836..3b3e107 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -33,7 +33,7 @@ define void @func_use_lds_global() {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_trap 2
@@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4
; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: .LBB2_4: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
@@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: .LBB4_2: ; %ret
; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
@@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB4_2: ; %ret
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
@@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB4_2: ; %ret
; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-LABEL: func_uses_lds_phi_after:
@@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB4_3: ; %ret
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
; SDAG-NEXT: .LBB4_4:
; SDAG-NEXT: s_endpgm
@@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: .LBB4_3: ; %ret
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB4_4:
; GISEL-NEXT: s_endpgm
@@ -616,6 +615,3 @@ ret:
; CHECK: {{.*}}
; GFX8: {{.*}}
; GFX9: {{.*}}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 5e76dfd..4477f02 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: .LBB2_2:
; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_mov_b64 s[6:7], exec
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: ; %bb.7:
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_add_rtn_f32 v2, v2, v1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: .LBB2_8:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: v_add_f32_e32 v2, s2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: s_cbranch_execz .LBB2_8
; GFX9-NEXT: ; %bb.7:
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_8:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 138dd53..d19ef75 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB11_5: ; %end
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB11_6:
; GFX11-NEXT: s_mov_b64 exec, 0
@@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB13_6:
; GFX11-NEXT: s_mov_b64 exec, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index eef5f57..ecebbb9 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
entry:
@@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
entry:
diff --git a/llvm/test/CodeGen/Generic/allow-check.ll b/llvm/test/CodeGen/Generic/allow-check.ll
index 43dab68..a084889 100644
--- a/llvm/test/CodeGen/Generic/allow-check.ll
+++ b/llvm/test/CodeGen/Generic/allow-check.ll
@@ -2,6 +2,7 @@
; REQUIRES: host-byteorder-little-endian
; -global-isel=1 is unsupported.
+; XFAIL: target=loongarch{{.*}}
; XFAIL: target=nvptx{{.*}}
; XFAIL: target=sparc{{.*}}
; XFAIL: target=hexagon-{{.*}}
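; (Illustrative aside, not part of the test: an "XFAIL: target=loongarch{{.*}}"
; line tells lit to treat the test as expected-to-fail whenever the configured
; target triple matches the pattern; {{.*}} is lit's embedded-regex syntax, so
; any loongarch triple, e.g. loongarch64-unknown-linux-gnu, is covered.)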
diff --git a/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll b/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
index b7f8b8a..8980049 100644
--- a/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
+++ b/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;RUN: llc < %s --mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=BE
;RUN: llc < %s --mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=LE
+;RUN: llc < %s --mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -ppc-gather-alias-max-depth=0 | FileCheck %s -check-prefix=FORWARD
define <8 x i32> @test_large_vec_vaarg(i32 %n, ...) {
; BE-LABEL: test_large_vec_vaarg:
@@ -35,6 +36,22 @@ define <8 x i32> @test_large_vec_vaarg(i32 %n, ...) {
; LE-NEXT: lxvd2x 0, 0, 3
; LE-NEXT: xxswapd 35, 0
; LE-NEXT: blr
+;
+; FORWARD-LABEL: test_large_vec_vaarg:
+; FORWARD: # %bb.0:
+; FORWARD-NEXT: ld 3, -8(1)
+; FORWARD-NEXT: addi 3, 3, 15
+; FORWARD-NEXT: rldicr 3, 3, 0, 59
+; FORWARD-NEXT: addi 4, 3, 16
+; FORWARD-NEXT: std 4, -8(1)
+; FORWARD-NEXT: ld 4, -8(1)
+; FORWARD-NEXT: lvx 2, 0, 3
+; FORWARD-NEXT: addi 4, 4, 15
+; FORWARD-NEXT: rldicr 3, 4, 0, 59
+; FORWARD-NEXT: addi 4, 3, 16
+; FORWARD-NEXT: std 4, -8(1)
+; FORWARD-NEXT: lvx 3, 0, 3
+; FORWARD-NEXT: blr
%args = alloca ptr, align 4
%x = va_arg ptr %args, <8 x i32>
ret <8 x i32> %x
diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
index cebd78a..b01115c 100644
--- a/llvm/test/CodeGen/PowerPC/sms-regpress.mir
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -1,41 +1,30 @@
-# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
+# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
# REQUIRES: asserts
# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
# The specific value of II is not important.
-# CHECK: Try to schedule with 21
-# CHECK: Can't schedule
-# CHECK: Try to schedule with 22
-# CHECK: Can't schedule
-# CHECK: Try to schedule with 23
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 24
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 25
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 26
-# CHECK: Schedule Found? 1 (II=26)
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Rejected the schedule because of too high register pressure{{$}}
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Schedule Found? 1 (II={{[0-9]+}}){{$}}
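# (Illustrative aside, not part of the original test: the rewritten CHECK lines
# above use FileCheck's {{...}} embedded-regex syntax so the test accepts any
# initiation interval instead of pinning II=26. For example,
#   # CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
#   # CHECK: {{^ *}}Schedule Found? 1 (II={{[0-9]+}}){{$}}
# still matches debug output such as "Try to schedule with 26" and
# "Schedule Found? 1 (II=26)", which is what "the specific value of II is not
# important" means in practice.)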
--- |
- ; ModuleID = 'a.ll'
- source_filename = "a.c"
target datalayout = "e-m:e-Fn32-i64:64-n32:64"
target triple = "ppc64le"
- ; Function Attrs: nofree nosync nounwind memory(argmem: read) uwtable
- define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr #0 {
+ define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr {
entry:
- %0 = load double, ptr %a, align 8, !tbaa !3
- %arrayidx1 = getelementptr inbounds double, ptr %a, i64 1
- %1 = load double, ptr %arrayidx1, align 8, !tbaa !3
+ %0 = load double, ptr %a, align 8
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 8
+ %1 = load double, ptr %arrayidx1, align 8
%cmp163 = icmp sgt i32 %n, 0
br i1 %cmp163, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %n to i64
- %scevgep1 = getelementptr i8, ptr %b, i64 -8
+ %wide.trip.count = zext nneg i32 %n to i64
+ %scevgep167 = getelementptr i8, ptr %b, i64 -8
call void @llvm.set.loop.iterations.i64(i64 %wide.trip.count)
br label %for.body
@@ -43,11 +32,11 @@
%res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %30, %for.body ]
ret double %res.0.lcssa
- for.body: ; preds = %for.body, %for.body.preheader
+ for.body: ; preds = %for.body.preheader, %for.body
%res.0165 = phi double [ 0.000000e+00, %for.body.preheader ], [ %30, %for.body ]
- %2 = phi ptr [ %scevgep1, %for.body.preheader ], [ %3, %for.body ]
+ %2 = phi ptr [ %scevgep167, %for.body.preheader ], [ %3, %for.body ]
%3 = getelementptr i8, ptr %2, i64 8
- %4 = load double, ptr %3, align 8, !tbaa !3
+ %4 = load double, ptr %3, align 8
%5 = tail call double @llvm.fmuladd.f64(double %0, double %4, double %0)
%6 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %5)
%7 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %6)
@@ -92,152 +81,23 @@
%mul66 = fmul double %12, %mul65
%30 = tail call double @llvm.fmuladd.f64(double %mul66, double %10, double %res.0165)
%31 = call i1 @llvm.loop.decrement.i64(i64 1)
- br i1 %31, label %for.body, label %for.cond.cleanup, !llvm.loop !7
+ br i1 %31, label %for.body, label %for.cond.cleanup
}
- ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
- declare double @llvm.fmuladd.f64(double, double, double) #1
+ declare double @llvm.fmuladd.f64(double, double, double)
- ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
- declare void @llvm.set.loop.iterations.i64(i64) #2
+ declare void @llvm.set.loop.iterations.i64(i64)
- ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
- declare i1 @llvm.loop.decrement.i64(i64) #2
+ declare i1 @llvm.loop.decrement.i64(i64)
- attributes #0 = { nofree nosync nounwind memory(argmem: read) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+power8-vector,+power9-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-privileged,-rop-protect,-spe" }
- attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
- attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 7, !"uwtable", i32 2}
- !2 = !{!"clang version 18.0.0 (https://miratech-soft@dev.azure.com/miratech-soft/llvm/_git/llvm c8d01fb665fc5d9378100a6d92ebcd3be49be655)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"double", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C/C++ TBAA"}
- !7 = distinct !{!7, !8, !9}
- !8 = !{!"llvm.loop.mustprogress"}
- !9 = !{!"llvm.loop.unroll.disable"}
-
...
---
name: kernel
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHCatchret: false
-hasEHScopes: false
-hasEHFunclets: false
-isOutlined: false
-debugInstrRef: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: vsfrc, preferred-register: '' }
- - { id: 1, class: vsfrc, preferred-register: '' }
- - { id: 2, class: g8rc, preferred-register: '' }
- - { id: 3, class: vsfrc, preferred-register: '' }
- - { id: 4, class: vsfrc, preferred-register: '' }
- - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 6, class: g8rc, preferred-register: '' }
- - { id: 7, class: vsfrc, preferred-register: '' }
- - { id: 8, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 10, class: g8rc, preferred-register: '' }
- - { id: 11, class: gprc, preferred-register: '' }
- - { id: 12, class: vsfrc, preferred-register: '' }
- - { id: 13, class: crrc, preferred-register: '' }
- - { id: 14, class: vsfrc, preferred-register: '' }
- - { id: 15, class: g8rc, preferred-register: '' }
- - { id: 16, class: g8rc, preferred-register: '' }
- - { id: 17, class: g8rc, preferred-register: '' }
- - { id: 18, class: f8rc, preferred-register: '' }
- - { id: 19, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 20, class: vsfrc, preferred-register: '' }
- - { id: 21, class: vsfrc, preferred-register: '' }
- - { id: 22, class: vsfrc, preferred-register: '' }
- - { id: 23, class: vsfrc, preferred-register: '' }
- - { id: 24, class: vsfrc, preferred-register: '' }
- - { id: 25, class: vsfrc, preferred-register: '' }
- - { id: 26, class: vsfrc, preferred-register: '' }
- - { id: 27, class: vsfrc, preferred-register: '' }
- - { id: 28, class: vsfrc, preferred-register: '' }
- - { id: 29, class: vsfrc, preferred-register: '' }
- - { id: 30, class: vsfrc, preferred-register: '' }
- - { id: 31, class: vsfrc, preferred-register: '' }
- - { id: 32, class: vsfrc, preferred-register: '' }
- - { id: 33, class: vsfrc, preferred-register: '' }
- - { id: 34, class: vsfrc, preferred-register: '' }
- - { id: 35, class: vsfrc, preferred-register: '' }
- - { id: 36, class: vsfrc, preferred-register: '' }
- - { id: 37, class: vsfrc, preferred-register: '' }
- - { id: 38, class: vsfrc, preferred-register: '' }
- - { id: 39, class: vsfrc, preferred-register: '' }
- - { id: 40, class: vsfrc, preferred-register: '' }
- - { id: 41, class: vsfrc, preferred-register: '' }
- - { id: 42, class: vsfrc, preferred-register: '' }
- - { id: 43, class: vsfrc, preferred-register: '' }
- - { id: 44, class: vsfrc, preferred-register: '' }
- - { id: 45, class: vsfrc, preferred-register: '' }
- - { id: 46, class: vsfrc, preferred-register: '' }
- - { id: 47, class: vsfrc, preferred-register: '' }
- - { id: 48, class: vsfrc, preferred-register: '' }
- - { id: 49, class: vsfrc, preferred-register: '' }
- - { id: 50, class: vsfrc, preferred-register: '' }
- - { id: 51, class: vsfrc, preferred-register: '' }
- - { id: 52, class: vsfrc, preferred-register: '' }
- - { id: 53, class: vsfrc, preferred-register: '' }
- - { id: 54, class: vsfrc, preferred-register: '' }
- - { id: 55, class: vsfrc, preferred-register: '' }
- - { id: 56, class: vsfrc, preferred-register: '' }
- - { id: 57, class: vsfrc, preferred-register: '' }
- - { id: 58, class: vsfrc, preferred-register: '' }
- - { id: 59, class: vsfrc, preferred-register: '' }
- - { id: 60, class: vsfrc, preferred-register: '' }
- - { id: 61, class: vsfrc, preferred-register: '' }
- - { id: 62, class: crbitrc, preferred-register: '' }
liveins:
- { reg: '$x3', virtual-reg: '%8' }
- { reg: '$x4', virtual-reg: '%9' }
- { reg: '$x5', virtual-reg: '%10' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: ''
- restorePoint: ''
-fixedStack: []
-stack: []
-entry_values: []
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo: {}
body: |
bb.0.entry:
successors: %bb.2(0x50000000), %bb.1(0x30000000)
@@ -251,16 +111,12 @@ body: |
BCC 44, killed %13, %bb.2
bb.1:
- successors: %bb.3(0x80000000)
-
%12:vsfrc = XXLXORdpz
B %bb.3
bb.2.for.body.preheader:
- successors: %bb.4(0x80000000)
-
- %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a, !tbaa !3)
- %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1, !tbaa !3)
+ %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a)
+ %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1)
%16:g8rc = IMPLICIT_DEF
%15:g8rc = INSERT_SUBREG killed %16, killed %11, %subreg.sub_32
%17:g8rc = RLDICL killed %15, 0, 32
@@ -279,7 +135,7 @@ body: |
%4:vsfrc = PHI %14, %bb.2, %7, %bb.4
%5:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %6, %bb.4
- %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3, !tbaa !3)
+ %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3)
%6:g8rc = COPY killed %19
%20:vsfrc = nofpexcept XSMADDADP %0, %0, %18, implicit $rm
%21:vsfrc = nofpexcept XSMADDADP %20, %20, %18, implicit $rm
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir
new file mode 100644
index 0000000..eda1180
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir
@@ -0,0 +1,902 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir \
+# RUN: -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir \
+# RUN: -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
+
+---
+name: anyext_nxv1i16_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s16>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i32_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i16_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s16>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i16_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s16>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv4i32_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i16_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s16>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv8i32_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i16_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s16>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv16i32_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv32i16_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 32 x s16>) = G_ANYEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i32_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i32_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i32_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i32_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i64_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s32>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ %1:vrb(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ %1:vrb(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
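+# Note (an observation on the checks above, not a normative statement): every
+# G_ANYEXT test in this file selects the same PseudoVZEXT_VF* pseudo that the
+# corresponding G_ZEXT test selects, i.e. anyext is currently handled as a
+# zero-extension here.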
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir
new file mode 100644
index 0000000..df0d48a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir
@@ -0,0 +1,534 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
+
+# Don't test i1 element types here since legalization has already widened them to i8.
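+#
+# A rough key to the checks below, inferred from the autogenerated output
+# rather than stated by it: eq, ne, ult, slt, ule, and sle select
+# PseudoVMSEQ/VMSNE/VMSLTU/VMSLT/VMSLEU/VMSLE_VV directly, while ugt, sgt,
+# uge, and sge appear to reuse VMSLTU/VMSLT/VMSLEU/VMSLE with the operands
+# commuted (not visible here, since both operands are the same IMPLICIT_DEF).
+# The _MF8.._M8 suffix is the source LMUL, and the trailing immediates are
+# AVL (-1 = VLMAX) and SEW. The mask result always fits a single register
+# (vr), so for sources wider than M1 it is marked early-clobber to keep it
+# from overlapping the source register group.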
+
+---
+name: icmp_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLTU_VV_MF8_:%[0-9]+]]:vr = PseudoVMSLTU_VV_MF8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_MF8_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLTU_VV_MF8_:%[0-9]+]]:vr = PseudoVMSLTU_VV_MF8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_MF8_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(ult), %0(<vscale x 1 x s8>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLT_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLT_VV_MF4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLT_VV_MF4_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLT_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLT_VV_MF4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLT_VV_MF4_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(slt), %0(<vscale x 2 x s8>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLEU_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLEU_VV_MF2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLEU_VV_MF2_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLEU_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLEU_VV_MF2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLEU_VV_MF2_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(uge), %0(<vscale x 4 x s8>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLE_VV_M1_:%[0-9]+]]:vr = PseudoVMSLE_VV_M1 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLE_VV_M1_:%[0-9]+]]:vr = PseudoVMSLE_VV_M1 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sge), %0(<vscale x 8 x s8>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 16 x s1>) = G_ICMP intpred(ugt), %0(<vscale x 16 x s8>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s8>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv64i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv64i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv64i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 64 x s1>) = G_ICMP intpred(ule), %0(<vscale x 64 x s8>), %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i16
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLE_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF4_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i16
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLE_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF4_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sle), %0(<vscale x 1 x s16>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i16
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSNE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSNE_VV_MF2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSNE_VV_MF2_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i16
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSNE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSNE_VV_MF2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSNE_VV_MF2_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(ne), %0(<vscale x 2 x s16>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i16
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i16
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(eq), %0(<vscale x 4 x s16>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(ult), %0(<vscale x 8 x s16>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 16 x s1>) = G_ICMP intpred(slt), %0(<vscale x 16 x s16>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 32 x s1>) = G_ICMP intpred(uge), %0(<vscale x 32 x s16>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i32
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF2_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i32
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF2_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sge), %0(<vscale x 1 x s32>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i32
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLTU_VV_M1_:%[0-9]+]]:vr = PseudoVMSLTU_VV_M1 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i32
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLTU_VV_M1_:%[0-9]+]]:vr = PseudoVMSLTU_VV_M1 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(ugt), %0(<vscale x 2 x s32>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s32>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M4 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M4 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(ule), %0(<vscale x 8 x s32>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLE_VV_M8 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLE_VV_M8 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sle), %0(<vscale x 16 x s32>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i64
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i64
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(eq), %0(<vscale x 1 x s64>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSNE_VV_M2 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSNE_VV_M2 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(ne), %0(<vscale x 2 x s64>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M4 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M4 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(ult), %0(<vscale x 4 x s64>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M8 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M8 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(ult), %0(<vscale x 8 x s64>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir
new file mode 100644
index 0000000..382166f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir
@@ -0,0 +1,900 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
+
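+# A rough key to the checks below, inferred from the autogenerated output:
+# each test selects a vsext pseudo whose VF2/VF4/VF8 suffix is the
+# result-to-source SEW ratio and whose MF4.._M8 suffix is the result LMUL.
+# The operands after the passthru and source are AVL (-1 = VLMAX), SEW, and
+# the tail/mask policy (3 = ta, ma). Results are marked early-clobber,
+# keeping the wider destination group from overlapping the narrower source.
+#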
+---
+name: sext_nxv1i16_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s16>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i32_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i16_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s16>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i16_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s16>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv4i32_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i16_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s16>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv8i32_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i16_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s16>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv16i32_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv32i16_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 32 x s16>) = G_SEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv1i32_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i32_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i32_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i32_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv1i64_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s32>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ %1:vrb(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ %1:vrb(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir
new file mode 100644
index 0000000..2fc9e05
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir
@@ -0,0 +1,900 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
+
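+# Note: each G_ZEXT below is expected to select a PseudoVZEXT_VF<N>_<LMUL>
+# pseudo, where VF<N> is the ratio of destination to source element width
+# (2, 4 or 8) and <LMUL> is the destination register group size. The SEW and
+# tail/mask policy operands are checked as immediates (e.g. 5 /* e32 */,
+# 3 /* ta, ma */).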
+---
+name: zext_nxv1i16_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s16>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i32_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i16_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s16>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i16_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s16>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv4i32_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i16_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s16>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv8i32_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i16_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s16>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv16i32_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv32i16_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 32 x s16>) = G_ZEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv1i32_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i32_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i32_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i32_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv1i64_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s32>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ %1:vrb(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ %1:vrb(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir
new file mode 100644
index 0000000..3a2d40f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir
@@ -0,0 +1,1589 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
+# Extend from s1 element vectors
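+# s1 vectors are mask vectors, so the legalizer lowers the extension to a
+# G_SELECT between splat-vector constants 1 and 0. On RV64 the s32 splat
+# immediates are first widened with G_ANYEXT to s64; on RV32, s64 splat
+# values are assembled from two s32 constants with G_MERGE_VALUES.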
+---
+name: anyext_nxv1i8_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i8_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i8_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s8>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i16_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i16_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i16_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s16>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i32_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i32_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i32_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s32>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i64_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i8_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i8_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i8_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s8>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i16_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i16_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i16_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s16>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i32_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i32_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i32_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s32>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i8_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i8_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i8_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s8>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv4i16_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i16_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i16_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s16>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv4i32_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i32_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i32_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s32>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i8_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i8_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i8_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s8>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv8i16_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i16_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i16_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s16>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv8i32_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i32_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i32_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s32>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv16i8_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv16i8_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i8_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s8>) = G_ANYEXT %1(<vscale x 16 x s1>)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv16i16_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv16i16_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i16_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s16>) = G_ANYEXT %1(<vscale x 16 x s1>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv16i32_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv16i32_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i32_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s32>) = G_ANYEXT %1(<vscale x 16 x s1>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv32i8_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv32i8_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv32i8_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s8>) = G_ANYEXT %1(<vscale x 32 x s1>)
+ $v8m4 = COPY %0(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv32i16_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv32i16_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv32i16_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s16>) = G_ANYEXT %1(<vscale x 32 x s1>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv64i8_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv64i8_nxv64i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv64i8_nxv64i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 64 x s1>) = COPY $v0
+ %0:_(<vscale x 64 x s8>) = G_ANYEXT %1(<vscale x 64 x s1>)
+ $v8m8 = COPY %0(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+...
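+
+# The mask-source cases above have no single RVV lowering, so the legalizer
+# expands G_ANYEXT of a <vscale x N x s1> operand into
+# G_SELECT(mask, splat(1), splat(0)). The splat immediates are materialized
+# in XLEN-wide scalars: RV64 widens each s32 constant with G_ANYEXT when the
+# element type is narrower than s64, while RV32 assembles s64 immediates
+# from two s32 constants with G_MERGE_VALUES.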
+
+# Extend from s8 element vectors
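+# (Extends between already-legal element types need no expansion; the RV32
+# and RV64 checks below are identical pass-throughs of the G_ANYEXT.)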
+---
+name: anyext_nxv1i16_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_ANYEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i32_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ANYEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i64_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i16_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_ANYEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i32_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ANYEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s8>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i16_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_ANYEXT %1(<vscale x 4 x s8>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv4i32_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ANYEXT %1(<vscale x 4 x s8>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s8>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i16_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_ANYEXT %1(<vscale x 8 x s8>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv8i32_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_ANYEXT %1(<vscale x 8 x s8>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s8>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv16i16_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s16>) = G_ANYEXT %1(<vscale x 16 x s8>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv16i32_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s8>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_ANYEXT %1(<vscale x 16 x s8>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv32i16_nxv32i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ %0:_(<vscale x 32 x s16>) = G_ANYEXT %1(<vscale x 32 x s8>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s16 element vectors
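+# As with the s8 sources, these are legal as-is; note how the destination
+# moves to larger register groups ($v8, $v8m2, $v8m4, $v8m8) as the total
+# vector size grows.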
+---
+name: anyext_nxv1i32_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ANYEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i64_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i32_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ANYEXT %1(<vscale x 2 x s16>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s16>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i32_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ANYEXT %1(<vscale x 4 x s16>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s16>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i32_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s32>) = G_ANYEXT %1(<vscale x 8 x s16>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s16>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv16i32_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_ANYEXT %1(<vscale x 16 x s16>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s32 element vectors
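+# Only the s64 destination remains at this element width; the extends again
+# survive legalization unchanged.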
+---
+name: anyext_nxv1i64_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s32>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s32>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s32>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s32>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir
new file mode 100644
index 0000000..d1df954
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir
@@ -0,0 +1,810 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
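+# G_ICMP with <vscale x N x s1> operands is legalized by widening the mask
+# operands to s8 vectors first: each operand becomes
+# G_SELECT(mask, splat(-1), splat(0)), the s8 vectors are compared, and the
+# comparison yields the final s1 mask. As in legalize-anyext.mir, RV64
+# anyextends the s32 splat immediates to s64 before G_SPLAT_VECTOR.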
+---
+name: icmp_nxv1i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 1 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 1 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 2 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 2 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 4 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 4 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 8 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 8 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 16 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 16 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv32i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv32i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 32 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv32i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 32 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv64i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv64i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 64 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv64i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 64 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv32i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv32i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv64i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv64i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv64i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv32i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv32i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv32i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir
new file mode 100644
index 0000000..1571daf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir
@@ -0,0 +1,1589 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
+# Extend from s1 element vectors
+---
+name: sext_nxv1i8_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i8_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i8_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s8>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i16_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i16_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i16_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s16>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i32_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i32_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i32_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s32>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i64_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i8_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i8_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i8_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s8>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i16_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i16_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i16_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s16>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i32_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i32_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i32_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s32>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i8_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i8_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv4i8_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s8>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv4i16_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i16_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv4i16_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s16>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv4i32_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i32_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv4i32_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s32>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i8_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i8_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i8_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s8>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv8i16_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i16_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv8i16_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s16>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv8i32_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i32_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv8i32_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s32>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv16i8_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv16i8_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv16i8_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s8>) = G_SEXT %1(<vscale x 16 x s1>)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv16i16_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv16i16_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv16i16_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s16>) = G_SEXT %1(<vscale x 16 x s1>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv16i32_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv16i32_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv16i32_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s32>) = G_SEXT %1(<vscale x 16 x s1>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv32i8_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv32i8_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv32i8_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s8>) = G_SEXT %1(<vscale x 32 x s1>)
+ $v8m4 = COPY %0(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv32i16_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv32i16_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv32i16_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s16>) = G_SEXT %1(<vscale x 32 x s1>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv64i8_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv64i8_nxv64i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv64i8_nxv64i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 64 x s1>) = COPY $v0
+ %0:_(<vscale x 64 x s8>) = G_SEXT %1(<vscale x 64 x s1>)
+ $v8m8 = COPY %0(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s8 element vectors
+---
+name: sext_nxv1i16_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_SEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i32_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_SEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i64_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i16_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_SEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i32_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_SEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s8>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i16_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_SEXT %1(<vscale x 4 x s8>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv4i32_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_SEXT %1(<vscale x 4 x s8>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s8>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i16_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_SEXT %1(<vscale x 8 x s8>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv8i32_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_SEXT %1(<vscale x 8 x s8>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s8>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv16i16_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s16>) = G_SEXT %1(<vscale x 16 x s8>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv16i32_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s32>) = G_SEXT %1(<vscale x 16 x s8>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv32i16_nxv32i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ %0:_(<vscale x 32 x s16>) = G_SEXT %1(<vscale x 32 x s8>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s16 element vectors
+---
+name: sext_nxv1i32_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_SEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i64_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i32_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_SEXT %1(<vscale x 2 x s16>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s16>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i32_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_SEXT %1(<vscale x 4 x s16>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s16>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i32_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s32>) = G_SEXT %1(<vscale x 8 x s16>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s16>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv16i32_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_SEXT %1(<vscale x 16 x s16>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s32 element vectors
+---
+name: sext_nxv1i64_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s32>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s32>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s32>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s32>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
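The sext tests above all encode one algebraic fact: sign-extending an i1 lane is the same as selecting, per lane, between an all-ones splat and a zero splat, which is exactly the G_SELECT over G_SPLAT_VECTOR sequence the RV32 and RV64 check lines record (RV32 assembles s64 splat immediates from s32 halves with G_MERGE_VALUES; RV64 anyextends s32 immediates to s64 where needed). A minimal Python model of that equivalence follows; the element widths and the sample mask are illustrative, not taken from the tests:

    # sext(<k x s1>) == select(mask, splat(-1), splat(0)), modeled per lane.
    def sext_i1(bit, width):
        """Sign-extend a 1-bit value to `width` bits (two's complement)."""
        assert bit in (0, 1)
        return (1 << width) - 1 if bit else 0   # all-ones or all-zeros

    def select_splat(mask, width):
        """The legalized form: per-lane select between splat(-1) and splat(0)."""
        ones = (1 << width) - 1                 # splat(-1) at this element width
        return [ones if m else 0 for m in mask]

    mask = [1, 0, 1, 1, 0, 0, 1, 0]             # stand-in for a <vscale x 8 x s1> mask
    for width in (8, 16, 32, 64):
        assert [sext_i1(m, width) for m in mask] == select_splat(mask, width)

Wider-element sources (the s8, s16, and s32 cases above) need no such rewrite, which is why their check lines keep the G_SEXT unchanged.
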
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir
new file mode 100644
index 0000000..109536a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir
@@ -0,0 +1,694 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: splatvector_nxv1i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv1i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 1 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv2i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv2i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 2 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv4i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv4i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 4 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv8i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv8i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 8 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv16i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv16i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 16 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv32i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv32i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 32 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv64i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv64i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 64 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+...
+
+---
+name: splatvector_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+
+...
+
+---
+name: splatvector_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8m2 = COPY %2(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m2 = COPY %2(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m4 = COPY %2(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m2 = COPY %2(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m4 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv16i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m8 = COPY %2(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir
new file mode 100644
index 0000000..7bf5f83
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir
@@ -0,0 +1,822 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
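+# Constant i1 splats legalize to G_VMCLR_VL (all false) or G_VMSET_VL (all
+# true); a variable i1 splat is widened to an s8 splat and compared against a
+# zero splat. Scalars narrower than XLEN are any-extended to s64 first.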
+---
+name: splatvector_nxv1i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv1i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 1 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv2i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv2i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 2 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv4i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv4i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 4 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv8i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv8i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 8 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv16i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv16i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 16 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv32i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv32i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 32 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv64i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv64i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 64 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+...
+
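+# Integer element splats: scalar operands narrower than s64 are any-extended
+# to the s64 XLEN type before the splat; s64 scalars are splatted directly.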
+---
+name: splatvector_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+
+...
+
+---
+name: splatvector_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8m2 = COPY %2(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m2 = COPY %2(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m4 = COPY %2(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m2 = COPY %2(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m4 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv16i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m8 = COPY %2(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: splatvector_nxv1i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8 = COPY %2(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8m2 = COPY %2(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv4i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8m4 = COPY %2(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv8i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8m8 = COPY %2(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir
new file mode 100644
index 0000000..806c9b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir
@@ -0,0 +1,120 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=HasF64 %s
+# RUN: llc -mtriple=riscv32 -mattr=+Zve64x -run-pass=legalizer %s -o - | FileCheck --check-prefix=NoF64 %s
+
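+# Splatting an s64 scalar on riscv32: with +v (HasF64 prefix) the s64 is
+# assembled with G_MERGE_VALUES and splatted directly; with Zve64x and no
+# 64-bit FP (NoF64 prefix) the legalizer instead emits the target-specific
+# G_SPLAT_VECTOR_SPLIT_I64_VL pseudo, which takes the two s32 halves.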
+---
+name: splatvector_nxv1i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv1i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv1i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 1 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv2i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv2i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8m2 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 2 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8m2
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv4i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv4i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv4i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8m4 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 4 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8m4
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv8i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv8i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv8i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8m8 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 8 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8m8
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir
index 4de02b1..8a34521 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir
@@ -9,8 +9,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s8>) = COPY $v8
%1:_(<vscale x 1 x s8>) = COPY $v9
@@ -27,8 +27,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 2 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s8>) = COPY $v8
%1:_(<vscale x 2 x s8>) = COPY $v9
@@ -45,8 +45,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 4 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s8>) = COPY $v8
%1:_(<vscale x 4 x s8>) = COPY $v9
@@ -63,8 +63,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 8 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s8>) = COPY $v8
%1:_(<vscale x 8 x s8>) = COPY $v9
@@ -81,8 +81,8 @@ body: |
; CHECK-LABEL: name: test_nxv16i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 16 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 16 x s8>) = COPY $v8m2
%1:_(<vscale x 16 x s8>) = COPY $v10m2
@@ -99,8 +99,8 @@ body: |
; CHECK-LABEL: name: test_nxv32i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 32 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 32 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 32 x s8>) = COPY $v8m4
%1:_(<vscale x 32 x s8>) = COPY $v12m4
@@ -117,8 +117,8 @@ body: |
; CHECK-LABEL: name: test_nxv64i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 64 x s8>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 64 x s8>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 64 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 64 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 64 x s8>) = COPY $v8m8
%1:_(<vscale x 64 x s8>) = COPY $v16m8
@@ -135,8 +135,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s16>) = COPY $v8
%1:_(<vscale x 1 x s16>) = COPY $v9
@@ -153,8 +153,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 2 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s16>) = COPY $v8
%1:_(<vscale x 2 x s16>) = COPY $v9
@@ -171,8 +171,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 4 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s16>) = COPY $v8
%1:_(<vscale x 4 x s16>) = COPY $v9
@@ -189,8 +189,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 8 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 8 x s16>) = COPY $v8m2
%1:_(<vscale x 8 x s16>) = COPY $v10m2
@@ -207,8 +207,8 @@ body: |
; CHECK-LABEL: name: test_nxv16i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 16 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 16 x s16>) = COPY $v8m4
%1:_(<vscale x 16 x s16>) = COPY $v12m4
@@ -225,8 +225,8 @@ body: |
; CHECK-LABEL: name: test_nxv32i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 32 x s16>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s16>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 32 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 32 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 32 x s16>) = COPY $v8m8
%1:_(<vscale x 32 x s16>) = COPY $v16m8
@@ -243,8 +243,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s32>) = COPY $v8
%1:_(<vscale x 1 x s32>) = COPY $v9
@@ -261,8 +261,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 2 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s32>) = COPY $v8
%1:_(<vscale x 2 x s32>) = COPY $v9
@@ -279,8 +279,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 4 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 4 x s32>) = COPY $v8m2
%1:_(<vscale x 4 x s32>) = COPY $v10m2
@@ -297,8 +297,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 8 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 8 x s32>) = COPY $v8m4
%1:_(<vscale x 8 x s32>) = COPY $v12m4
@@ -315,8 +315,8 @@ body: |
; CHECK-LABEL: name: test_nxv16i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 16 x s32>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s32>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 16 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 16 x s32>) = COPY $v8m8
%1:_(<vscale x 16 x s32>) = COPY $v16m8
@@ -333,8 +333,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s64>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s64>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s64>) = COPY $v8
%1:_(<vscale x 1 x s64>) = COPY $v9
@@ -351,8 +351,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 2 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 2 x s64>) = COPY $v8m2
%1:_(<vscale x 2 x s64>) = COPY $v10m2
@@ -369,8 +369,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 4 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 4 x s64>) = COPY $v8m4
%1:_(<vscale x 4 x s64>) = COPY $v12m4
@@ -387,8 +387,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s64>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s64>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 8 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 8 x s64>) = COPY $v8m8
%1:_(<vscale x 8 x s64>) = COPY $v16m8
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir
new file mode 100644
index 0000000..fe4ddfa
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir
@@ -0,0 +1,1591 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
+# Extend from s1 element vectors
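+# A G_ZEXT from a mask vector legalizes to a G_SELECT between splats of 1
+# (true lanes) and 0 (false lanes) in the destination element type.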
+---
+name: zext_nxv1i8_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i8_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i8_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s8>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i16_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i16_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i16_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i32_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i32_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i32_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i64_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i8_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i8_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i8_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s8>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i16_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i16_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i16_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i32_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i32_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i32_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i8_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i8_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv4i8_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s8>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv4i16_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i16_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv4i16_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv4i32_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i32_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv4i32_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i8_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i8_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i8_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s8>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv8i16_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i16_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv8i16_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv8i32_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i32_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv8i32_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv16i8_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i8_nxv16i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv16i8_nxv16i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s1>) = COPY $v8
+ %0:_(<vscale x 16 x s8>) = G_ZEXT %1(<vscale x 16 x s1>)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv16i16_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i16_nxv16i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv16i16_nxv16i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s1>) = COPY $v8
+ %0:_(<vscale x 16 x s16>) = G_ZEXT %1(<vscale x 16 x s1>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv16i32_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i32_nxv16i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv16i32_nxv16i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s1>) = COPY $v8
+ %0:_(<vscale x 16 x s32>) = G_ZEXT %1(<vscale x 16 x s1>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv32i8_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv32i8_nxv32i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv32i8_nxv32i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 32 x s1>) = COPY $v8
+ %0:_(<vscale x 32 x s8>) = G_ZEXT %1(<vscale x 32 x s1>)
+ $v8m4 = COPY %0(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv32i16_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv32i16_nxv32i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv32i16_nxv32i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s1>) = COPY $v8
+ %0:_(<vscale x 32 x s16>) = G_ZEXT %1(<vscale x 32 x s1>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv64i8_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv64i8_nxv64i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv64i8_nxv64i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 64 x s1>) = COPY $v8
+ %0:_(<vscale x 64 x s8>) = G_ZEXT %1(<vscale x 64 x s1>)
+ $v8m8 = COPY %0(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+...
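+
+# As the checks above show, G_ZEXT from an i1 mask vector is not legal and is
+# lowered to a vector select between splats of the two possible element
+# values:
+#   %zext = G_ZEXT %mask  ==>  %zext = G_SELECT %mask, splat(1), splat(0)
+# The splat scalar must itself be legal: on RV64 the s32 immediates are
+# any-extended to s64 first, and s64 splats on RV32 assemble the scalar from
+# two s32 halves with G_MERGE_VALUES.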
+
+# Extend from s8 element vectors
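+# These wider-element extends (and the s16 and s32 sections below) are
+# already legal, so the legalizer keeps the single G_ZEXT unchanged on both
+# RV32 and RV64.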
+---
+name: zext_nxv1i16_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_ZEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i32_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ZEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i64_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i16_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_ZEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i32_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ZEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s8>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i16_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_ZEXT %1(<vscale x 4 x s8>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv4i32_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ZEXT %1(<vscale x 4 x s8>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s8>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i16_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_ZEXT %1(<vscale x 8 x s8>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv8i32_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_ZEXT %1(<vscale x 8 x s8>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s8>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv16i16_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8m2
+ ; RV32-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV32: liveins: $v8m2
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV64: liveins: $v8m2
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s16>) = G_ZEXT %1(<vscale x 16 x s8>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv16i32_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8m2
+ ; RV32-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV32: liveins: $v8m2
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV64: liveins: $v8m2
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s32>) = G_ZEXT %1(<vscale x 16 x s8>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv32i16_nxv32i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8m4
+ ; RV32-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV32: liveins: $v8m4
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV64: liveins: $v8m4
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ %0:_(<vscale x 32 x s16>) = G_ZEXT %1(<vscale x 32 x s8>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s16 element vectors
+---
+name: zext_nxv1i32_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ZEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i64_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i32_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ZEXT %1(<vscale x 2 x s16>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s16>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i32_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ZEXT %1(<vscale x 4 x s16>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s16>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i32_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8m2
+ ; RV32-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV32: liveins: $v8m2
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV64: liveins: $v8m2
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s32>) = G_ZEXT %1(<vscale x 8 x s16>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8m4
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV32: liveins: $v8m4
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV64: liveins: $v8m4
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s16>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s16>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv16i32_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8m4
+ ; RV32-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV32: liveins: $v8m4
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_ZEXT %1(<vscale x 16 x s16>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s32 element vectors
+---
+name: zext_nxv1i64_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s32>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s32>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s32>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s32>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir
new file mode 100644
index 0000000..062179c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir
@@ -0,0 +1,820 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
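+# These tests check that regbankselect assigns the vector register bank (vrb)
+# to every scalable-vector G_ANYEXT source and result, covering widenings from
+# s8, s16, and s32 elements across the LMUL register groupings ($v8, $v8m2,
+# $v8m4, $v8m8).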
+---
+name: anyext_nxv1i16_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s16>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i32_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i16_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s16>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i16_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s16>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv4i32_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i16_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s16>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv8i32_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i16_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s16>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv16i32_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s8>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv32i16_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 32 x s8>) = COPY $v8m4
+ %1:_(<vscale x 32 x s16>) = G_ANYEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i32_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i32_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i32_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i32_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s16>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i64_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s32>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s32>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir
new file mode 100644
index 0000000..925d6ae
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir
@@ -0,0 +1,675 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
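+# These tests check that regbankselect assigns the vector register bank (vrb)
+# to both G_ICMP inputs and to the <vscale x N x s1> mask result, for scalable
+# vector types from nxv1i1 up to nxv8i64.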
+---
+name: icmp_nxv1i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s1>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s1>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s1>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s1>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s1>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s1>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv64i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv64i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv64i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 64 x s1>), %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s8>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s8>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s8>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s8>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s8>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s8>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv64i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv64i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv64i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 64 x s8>), %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s16>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s16>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s16>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s16>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s16>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s16>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s32>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s32>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s32>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s32>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s32>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s64>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s64>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s64>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s64>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir
new file mode 100644
index 0000000..a754b8b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir
@@ -0,0 +1,820 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
+---
+name: sext_nxv1i16_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s16>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i32_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i16_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s16>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i16_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s16>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv4i32_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i16_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s16>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv8i32_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i16_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s16>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv16i32_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv32i16_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 32 x s8>) = COPY $v8m4
+ %1:_(<vscale x 32 x s16>) = G_SEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv1i32_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i32_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i32_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i32_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s16>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv1i64_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s32>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s32>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir
new file mode 100644
index 0000000..c3bc4a9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir
@@ -0,0 +1,820 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
+---
+name: zext_nxv1i16_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s16>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i32_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i16_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s16>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i16_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s16>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv4i32_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i16_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s16>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv8i32_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i16_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s16>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv16i32_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv32i16_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 32 x s8>) = COPY $v8m4
+ %1:_(<vscale x 32 x s16>) = G_ZEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv1i32_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i32_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i32_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i32_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s16>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv1i64_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s32>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s32>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index bafa92e..65d0768 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -18,14 +18,12 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vand.vv v8, v11, v8
+; RV32-NEXT: vmerge.vvm v8, v8, v11, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv4i32:
@@ -41,14 +39,12 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vand.vv v8, v11, v8
+; RV64-NEXT: vmerge.vvm v8, v8, v11, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: lui a1, 16
-; RV64-NEXT: addiw a1, a1, -1
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
ret i32 %res
@@ -158,8 +154,7 @@ define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
; RV64-NEXT: li a1, -1
; RV64-NEXT: vmadd.vx v16, a1, v8
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vand.vv v8, v16, v8
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
index f5305a1..83d1d1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
@@ -19,10 +19,9 @@ define <4 x i64> @vwsll_vv_v4i64_sext(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%y = sext <4 x i32> %b to <4 x i64>
@@ -41,10 +40,9 @@ define <4 x i64> @vwsll_vv_v4i64_zext(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%y = zext <4 x i32> %b to <4 x i64>
@@ -62,9 +60,9 @@ define <4 x i64> @vwsll_vx_i64_v4i64(<4 x i32> %a, i64 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i64_v4i64:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i64> poison, i64 %b, i32 0
%splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
@@ -88,10 +86,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_sext(<4 x i32> %a, i32 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -116,10 +112,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_zext(<4 x i32> %a, i32 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -142,12 +136,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_sext(<4 x i32> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i16> poison, i16 %b, i32 0
%splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -170,12 +161,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_zext(<4 x i32> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i16> poison, i16 %b, i32 0
%splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -198,12 +186,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_sext(<4 x i32> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i8> poison, i8 %b, i32 0
%splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -226,12 +211,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i8> poison, i8 %b, i32 0
%splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -251,9 +233,9 @@ define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v4i64:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%z = shl <4 x i64> %x, splat (i64 2)
@@ -275,10 +257,9 @@ define <8 x i32> @vwsll_vv_v8i32_sext(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%y = sext <8 x i16> %b to <8 x i32>
@@ -297,10 +278,9 @@ define <8 x i32> @vwsll_vv_v8i32_zext(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%y = zext <8 x i16> %b to <8 x i32>
@@ -318,9 +298,9 @@ define <8 x i32> @vwsll_vx_i64_v8i32(<8 x i16> %a, i64 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i64_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i64> poison, i64 %b, i32 0
%splat = shufflevector <8 x i64> %head, <8 x i64> poison, <8 x i32> zeroinitializer
@@ -340,9 +320,9 @@ define <8 x i32> @vwsll_vx_i32_v8i32(<8 x i16> %a, i32 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i32_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i32> poison, i32 %b, i32 0
%splat = shufflevector <8 x i32> %head, <8 x i32> poison, <8 x i32> zeroinitializer
@@ -366,10 +346,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_sext(<8 x i16> %a, i16 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i16> poison, i16 %b, i32 0
%splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -394,10 +372,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_zext(<8 x i16> %a, i16 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i16> poison, i16 %b, i32 0
%splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -420,12 +396,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_sext(<8 x i16> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i8> poison, i8 %b, i32 0
%splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -448,12 +421,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i8> poison, i8 %b, i32 0
%splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -473,9 +443,9 @@ define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%z = shl <8 x i32> %x, splat (i32 2)
@@ -497,10 +467,9 @@ define <16 x i16> @vwsll_vv_v16i16_sext(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%y = sext <16 x i8> %b to <16 x i16>
@@ -519,10 +488,9 @@ define <16 x i16> @vwsll_vv_v16i16_zext(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%y = zext <16 x i8> %b to <16 x i16>
@@ -552,12 +520,9 @@ define <16 x i16> @vwsll_vx_i32_v16i16(<16 x i8> %a, i32 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i32_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v12, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i32> poison, i32 %b, i32 0
%splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
@@ -577,9 +542,9 @@ define <16 x i16> @vwsll_vx_i16_v16i16(<16 x i8> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i16> poison, i16 %b, i32 0
%splat = shufflevector <16 x i16> %head, <16 x i16> poison, <16 x i32> zeroinitializer
@@ -603,10 +568,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_sext(<16 x i8> %a, i8 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i8> poison, i8 %b, i32 0
%splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -631,10 +594,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i8> poison, i8 %b, i32 0
%splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -654,9 +615,9 @@ define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%z = shl <16 x i16> %x, splat (i16 2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll
new file mode 100644
index 0000000..3a8d08f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+; The following binop x, (zext i1) tests will be vector-legalized into a vselect
+; of two splat_vectors, but on RV64 the splat value will be implicitly
+; truncated:
+;
+; t15: nxv2i32 = splat_vector Constant:i64<1>
+; t13: nxv2i32 = splat_vector Constant:i64<0>
+; t16: nxv2i32 = vselect t2, t15, t13
+; t7: nxv2i32 = add t4, t16
+;
+; Make sure that foldSelectWithIdentityConstant in DAGCombiner.cpp handles the
+; truncating splat, so we pull the vselect back and fold it into a mask.
+
+define <vscale x 2 x i32> @i1_zext_add(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %add = add <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %add
+}
+
+define <vscale x 2 x i32> @i1_zext_add_commuted(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_add_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %add = add <vscale x 2 x i32> %zext, %b
+ ret <vscale x 2 x i32> %add
+}
+
+define <vscale x 2 x i32> @i1_zext_sub(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_sub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %sub = sub <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %sub
+}
+
+define <vscale x 2 x i32> @i1_zext_or(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_or:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vor.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %or = or <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %or
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index e56dca0..a14ce71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -149,49 +149,49 @@ define <vscale x 2 x i64> @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y
}
define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
-; RV32-LABEL: vwop_vscale_sext_i1i32_multiple_users:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
-; RV32-NEXT: vlm.v v8, (a0)
-; RV32-NEXT: vlm.v v9, (a1)
-; RV32-NEXT: vlm.v v10, (a2)
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vmerge.vim v12, v11, -1, v0
-; RV32-NEXT: vmv.v.v v0, v9
-; RV32-NEXT: vmerge.vim v9, v11, -1, v0
-; RV32-NEXT: vmv.v.v v0, v10
-; RV32-NEXT: vmerge.vim v10, v11, -1, v0
-; RV32-NEXT: vmul.vv v9, v12, v9
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsub.vv v11, v12, v10
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vsub.vx v10, v10, a0, v0.t
-; RV32-NEXT: vor.vv v8, v9, v10
-; RV32-NEXT: vor.vv v8, v8, v11
-; RV32-NEXT: ret
+; NO_FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; NO_FOLDING-NEXT: vlm.v v8, (a0)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vlm.v v10, (a2)
+; NO_FOLDING-NEXT: vmv.v.i v11, 0
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v9
+; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v10
+; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: li a0, 1
+; NO_FOLDING-NEXT: vsub.vv v11, v12, v10
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v9, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v11
+; NO_FOLDING-NEXT: ret
;
-; RV64-LABEL: vwop_vscale_sext_i1i32_multiple_users:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vlm.v v8, (a0)
-; RV64-NEXT: vlm.v v9, (a1)
-; RV64-NEXT: vlm.v v10, (a2)
-; RV64-NEXT: vmv.v.i v11, 0
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v12, v11, -1, v0
-; RV64-NEXT: vmv.v.v v0, v9
-; RV64-NEXT: vmerge.vim v9, v11, -1, v0
-; RV64-NEXT: vmv.v.v v0, v10
-; RV64-NEXT: vmerge.vim v10, v11, -1, v0
-; RV64-NEXT: vmul.vv v9, v12, v9
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v11, 1, v0
-; RV64-NEXT: vsub.vv v8, v10, v8
-; RV64-NEXT: vsub.vv v10, v12, v10
-; RV64-NEXT: vor.vv v8, v9, v8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: ret
+; FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; FOLDING-NEXT: vlm.v v8, (a0)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vlm.v v10, (a2)
+; FOLDING-NEXT: vmv.v.i v11, 0
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; FOLDING-NEXT: vmv.v.v v0, v9
+; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; FOLDING-NEXT: vmv.v.v v0, v10
+; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: li a0, 1
+; FOLDING-NEXT: vsub.vv v11, v12, v10
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; FOLDING-NEXT: vor.vv v8, v9, v10
+; FOLDING-NEXT: vor.vv v8, v8, v11
+; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
%b2 = load <vscale x 2 x i1>, ptr %z
@@ -209,7 +209,7 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; NO_FOLDING-NEXT: vlm.v v8, (a0)
; NO_FOLDING-NEXT: vlm.v v9, (a1)
; NO_FOLDING-NEXT: vlm.v v10, (a2)
@@ -221,17 +221,17 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
; NO_FOLDING-NEXT: vmv1r.v v0, v10
; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: li a0, 1
+; NO_FOLDING-NEXT: vsub.vv v11, v12, v10
; NO_FOLDING-NEXT: vmv1r.v v0, v8
-; NO_FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
-; NO_FOLDING-NEXT: vsub.vv v8, v10, v8
-; NO_FOLDING-NEXT: vsub.vv v10, v12, v10
-; NO_FOLDING-NEXT: vor.vv v8, v9, v8
-; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v9, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v11
; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; FOLDING-NEXT: vlm.v v8, (a0)
; FOLDING-NEXT: vlm.v v9, (a1)
; FOLDING-NEXT: vlm.v v10, (a2)
@@ -243,12 +243,12 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
; FOLDING-NEXT: vmv1r.v v0, v10
; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: li a0, 1
+; FOLDING-NEXT: vsub.vv v11, v12, v10
; FOLDING-NEXT: vmv1r.v v0, v8
-; FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
-; FOLDING-NEXT: vsub.vv v8, v10, v8
-; FOLDING-NEXT: vsub.vv v10, v12, v10
-; FOLDING-NEXT: vor.vv v8, v9, v8
-; FOLDING-NEXT: vor.vv v8, v8, v10
+; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; FOLDING-NEXT: vor.vv v8, v9, v10
+; FOLDING-NEXT: vor.vv v8, v8, v11
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
@@ -444,41 +444,39 @@ define <vscale x 2 x i64> @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y
}
define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
-; RV32-LABEL: vwop_vscale_zext_i1i32_multiple_users:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
-; RV32-NEXT: vlm.v v0, (a0)
-; RV32-NEXT: vlm.v v8, (a2)
-; RV32-NEXT: vlm.v v9, (a1)
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v11, v10, 1, v0
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v10, 1, v0
-; RV32-NEXT: vadd.vv v10, v11, v8
-; RV32-NEXT: vsub.vv v8, v11, v8
-; RV32-NEXT: vmv.v.v v0, v9
-; RV32-NEXT: vor.vv v10, v10, v11, v0.t
-; RV32-NEXT: vor.vv v8, v10, v8
-; RV32-NEXT: ret
+; NO_FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; NO_FOLDING-NEXT: vlm.v v0, (a0)
+; NO_FOLDING-NEXT: vlm.v v8, (a2)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vmv.v.i v10, 0
+; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v11, v8
+; NO_FOLDING-NEXT: vmv.v.v v0, v9
+; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
+; NO_FOLDING-NEXT: ret
;
-; RV64-LABEL: vwop_vscale_zext_i1i32_multiple_users:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vlm.v v0, (a0)
-; RV64-NEXT: vlm.v v8, (a1)
-; RV64-NEXT: vlm.v v9, (a2)
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vmerge.vim v11, v10, 1, v0
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v10, 1, v0
-; RV64-NEXT: vmv.v.v v0, v9
-; RV64-NEXT: vmerge.vim v9, v10, 1, v0
-; RV64-NEXT: vmul.vv v8, v11, v8
-; RV64-NEXT: vadd.vv v10, v11, v9
-; RV64-NEXT: vsub.vv v9, v11, v9
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: ret
+; FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; FOLDING-NEXT: vlm.v v0, (a0)
+; FOLDING-NEXT: vlm.v v8, (a2)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vmv.v.i v10, 0
+; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vadd.vv v10, v11, v8
+; FOLDING-NEXT: vsub.vv v8, v11, v8
+; FOLDING-NEXT: vmv.v.v v0, v9
+; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT: vor.vv v8, v10, v8
+; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
%b2 = load <vscale x 2 x i1>, ptr %z
@@ -496,40 +494,36 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; NO_FOLDING-NEXT: vlm.v v0, (a0)
-; NO_FOLDING-NEXT: vlm.v v8, (a1)
-; NO_FOLDING-NEXT: vlm.v v9, (a2)
+; NO_FOLDING-NEXT: vlm.v v8, (a2)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
; NO_FOLDING-NEXT: vmv.v.i v10, 0
; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
; NO_FOLDING-NEXT: vmv1r.v v0, v8
; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v11, v8
; NO_FOLDING-NEXT: vmv1r.v v0, v9
-; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
-; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
-; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
-; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
-; NO_FOLDING-NEXT: vor.vv v8, v8, v10
-; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; FOLDING-NEXT: vlm.v v0, (a0)
-; FOLDING-NEXT: vlm.v v8, (a1)
-; FOLDING-NEXT: vlm.v v9, (a2)
+; FOLDING-NEXT: vlm.v v8, (a2)
+; FOLDING-NEXT: vlm.v v9, (a1)
; FOLDING-NEXT: vmv.v.i v10, 0
; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
; FOLDING-NEXT: vmv1r.v v0, v8
; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vadd.vv v10, v11, v8
+; FOLDING-NEXT: vsub.vv v8, v11, v8
; FOLDING-NEXT: vmv1r.v v0, v9
-; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
-; FOLDING-NEXT: vmul.vv v8, v11, v8
-; FOLDING-NEXT: vadd.vv v10, v11, v9
-; FOLDING-NEXT: vsub.vv v9, v11, v9
-; FOLDING-NEXT: vor.vv v8, v8, v10
-; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT: vor.vv v8, v10, v8
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
@@ -594,3 +588,6 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 0d52dd7..0a5e501 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -825,3 +825,56 @@ define <vscale x 2 x i1> @select_cond_x_cond(<vscale x 2 x i1> %x, <vscale x 2 x
%a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> %x, i32 %evl)
ret <vscale x 2 x i1> %a
}
+
+define <vscale x 2 x i1> @select_undef_T_F(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_undef_T_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_undef_undef_F(<vscale x 2 x i1> %x, i32 zeroext %evl) {
+; CHECK-LABEL: select_undef_undef_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> undef, <vscale x 2 x i1> %x, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_unknown_undef_F(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_unknown_undef_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> undef, <vscale x 2 x i1> %y, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_unknown_T_undef(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_unknown_T_undef:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> poison, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_false_T_F(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> %z, i32 zeroext %evl) {
+; CHECK-LABEL: select_false_T_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> %y, <vscale x 2 x i1> %z, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_unknown_T_T(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_unknown_T_T:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> %y, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index 770bb56..082de2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -627,3 +627,259 @@ define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a) {
%z = shl <vscale x 8 x i16> %x, splat (i16 2)
ret <vscale x 8 x i16> %z
}
+
+; ==============================================================================
+; i8 -> i64
+; ==============================================================================
+
+define <vscale x 2 x i64> @vwsll_vv_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i8> %b to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vv_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i8> %b to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i64_nxv2i64_nxv2i8(<vscale x 2 x i8> %a, i64 %b) {
+; CHECK-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsll.vx v8, v10, a0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %splat
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i32 %b) {
+; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf2 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i32> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i32 %b) {
+; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf2 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i32> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i16 %b) {
+; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf4 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i16> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i16 %b) {
+; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf4 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i16> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i8 %b) {
+; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i8> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i8 %b) {
+; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i8> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vi_nxv2i64_nxv2i8(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, splat (i64 2)
+ ret <vscale x 2 x i64> %z
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
index 5bf2adb..07eb67d 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
@@ -11,3 +11,12 @@ entry:
tail call void asm sideeffect "faddq $0,$1,$2", "{f38},{f0},{f0}"(fp128 0xL0, fp128 0xL0, fp128 0xL0)
ret void
}
+
+; CHECK-LABEL: test_twinword_error
+; CHECK: error: Hi part of pair should point to an even-numbered register
+; CHECK: error: (note that in some cases it might be necessary to manually bind the input/output registers instead of relying on automatic allocation)
+
+define i64 @test_twinword_error() {
+ %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i1}"()
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm.ll b/llvm/test/CodeGen/SPARC/inlineasm.ll
index ec27598..9817d7c 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm.ll
@@ -143,3 +143,12 @@ entry:
%1 = call double asm sideeffect "faddd $1, $2, $0", "=f,f,e"(i64 0, i64 0)
ret void
}
+
+; CHECK-LABEL: test_twinword
+; CHECK: rd %asr5, %i1
+; CHECK: srlx %i1, 32, %i0
+
+define i64 @test_twinword() {
+ %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"()
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
index a4ca3aa..6057bf38 100644
--- a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
+++ b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
@@ -1,10 +1,14 @@
-; REQUIRES: spirv-tools
-; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | not spirv-val 2>&1 | FileCheck %s
+; All OpVariable instructions in a function must be the first instructions in the first block
-; TODO(#66261): The SPIR-V backend should reorder OpVariable instructions so this doesn't fail,
-; but in the meantime it's a good example of the spirv-val tool working as intended.
+; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | spirv-val %}
-; CHECK: All OpVariable instructions in a function must be the first instructions in the first block.
+; CHECK-SPIRV: OpFunction
+; CHECK-SPIRV-NEXT: OpLabel
+; CHECK-SPIRV-NEXT: OpVariable
+; CHECK-SPIRV-NEXT: OpVariable
+; CHECK-SPIRV: OpReturn
+; CHECK-SPIRV: OpFunctionEnd
define void @main() #1 {
entry:
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
index 1071d34..b039f80 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
@@ -10,22 +10,46 @@
; CHECK-SPIRV-DAG: OpName %[[FooObj:.*]] "foo_object"
; CHECK-SPIRV-DAG: OpName %[[FooMemOrder:.*]] "mem_order"
; CHECK-SPIRV-DAG: OpName %[[FooFunc:.*]] "foo"
+
; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 32 0
; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer CrossWorkgroup %[[TyLong]]
; CHECK-SPIRV-DAG: %[[TyFunPtrLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtrLong]]
-; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrLong:.*]] = OpTypePointer Generic %[[TyGenPtrLong]]
; CHECK-SPIRV-DAG: %[[TyFunGenPtrLongLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrLong]] %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]]
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]]
; CHECK-SPIRV-DAG: %[[Const3:.*]] = OpConstant %[[TyLong]] 3
+
; CHECK-SPIRV: %[[FunTest]] = OpFunction %[[TyVoid]] None %[[TyFunPtrLong]]
; CHECK-SPIRV: %[[ArgCum]] = OpFunctionParameter %[[TyPtrLong]]
+
; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[Addr]] %[[Const3]]
+
+; CHECK-SPIRV: %[[HalfAddr:.*]] = OpPtrCastToGeneric
+; CHECK-SPIRV-NEXT: %[[HalfAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[HalfAddr]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[HalfAddrCasted]] %[[Const3]]
+
+; CHECK-SPIRV: %[[DblAddr:.*]] = OpPtrCastToGeneric
+; CHECK-SPIRV-NEXT: %[[DblAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[DblAddr]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[DblAddrCasted]] %[[Const3]]
+
; CHECK-SPIRV: %[[FooStub]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]]
; CHECK-SPIRV: %[[StubObj]] = OpFunctionParameter %[[TyGenPtrLong]]
; CHECK-SPIRV: %[[MemOrder]] = OpFunctionParameter %[[TyLong]]
+
+; CHECK-SPIRV: %[[ObjectAddr:.*]] = OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: %[[ToGeneric:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[ObjectAddr]]
+; CHECK-SPIRV-NEXT: %[[Casted:.*]] = OpBitcast %[[TyGenPtrPtrLong]] %[[ToGeneric]]
+; CHECK-SPIRV-NEXT: OpStore %[[Casted]] %[[StubObj]]
+
; CHECK-SPIRV: %[[FooFunc]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]]
; CHECK-SPIRV: %[[FooObj]] = OpFunctionParameter %[[TyGenPtrLong]]
; CHECK-SPIRV: %[[FooMemOrder]] = OpFunctionParameter %[[TyLong]]
+
; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooStub]] %[[FooObj]] %[[FooMemOrder]]
define spir_kernel void @test(ptr addrspace(1) noundef align 4 %_arg_cum) {
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll
new file mode 100644
index 0000000..edb31ff
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]]
+; CHECK-SPIRV-DAG: %[[TyFunBar:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyFunFoo:.*]] = OpTypeFunction %[[TyVoid]] %[[TyLong]] %[[TyGenPtrPtrChar]] %[[TyGenPtrPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[Const100:.*]] = OpConstant %[[TyLong]] 100
+; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyPtrStruct:.*]] = OpTypePointer Generic %[[TyStruct]]
+; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
+
+; CHECK-SPIRV: %[[Bar:.*]] = OpFunction %[[TyVoid]] None %[[TyFunBar]]
+; CHECK-SPIRV: %[[BarArg:.*]] = OpFunctionParameter %[[TyGenPtrChar]]
+; CHECK-SPIRV-NEXT: OpLabel
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV: %[[Var1:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]]
+; CHECK-SPIRV: %[[Var2:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]]
+; CHECK-SPIRV: OpStore %[[#]] %[[BarArg]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var1]] %[[Var2]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var2]] %[[Var1]]
+
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[TyVoid]] None %[[TyFunFoo]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyLong]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]]
+
+%class.CustomType = type { i64 }
+
+define linkonce_odr dso_local spir_func void @bar(ptr addrspace(4) noundef %first) {
+entry:
+ %first.addr = alloca ptr addrspace(4)
+ %first.addr.ascast = addrspacecast ptr %first.addr to ptr addrspace(4)
+ %temp = alloca ptr addrspace(4), align 8
+ %temp.ascast = addrspacecast ptr %temp to ptr addrspace(4)
+ store ptr addrspace(4) %first, ptr %first.addr
+ call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast)
+ call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast)
+ %var = alloca ptr addrspace(4), align 8
+ ret void
+}
+
+define linkonce_odr dso_local spir_func void @foo(i64 noundef %offset, ptr addrspace(4) noundef dereferenceable(8) %in_acc1, ptr addrspace(4) noundef dereferenceable(8) %out_acc1) {
+entry:
+ %r0 = load ptr addrspace(4), ptr addrspace(4) %in_acc1
+ %arrayidx = getelementptr inbounds %class.CustomType, ptr addrspace(4) %r0, i64 42
+ %r1 = load i64, ptr addrspace(4) %arrayidx
+ %r3 = load ptr addrspace(4), ptr addrspace(4) %out_acc1
+ %r4 = getelementptr %class.CustomType, ptr addrspace(4) %r3, i64 43
+ store i64 %r1, ptr addrspace(4) %r4
+ ret void
+}
+
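The new test above exercises the SPIR-V pointer-type deduction path for direct calls: @bar hands its generic pointer arguments to @foo through addrspacecast'ed allocas, and the CHECK lines require both OpFunctionCall operands to keep the deduced %[[TyGenPtrPtrChar]] type with no intervening bitcast. The essential shape, trimmed to a minimal sketch (names shortened from the file above, not part of the test itself):

    %slot = alloca ptr addrspace(4)                      ; function-local pointer slot
    %gen  = addrspacecast ptr %slot to ptr addrspace(4)  ; move it into the Generic storage class
    call spir_func void @foo(i64 100, ptr addrspace(4) %gen, ptr addrspace(4) %gen)

Because both call operands already carry the deduced generic-pointer-to-pointer type, the backend is expected to emit the call against the declared function type directly.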
diff --git a/llvm/test/CodeGen/WebAssembly/multi-return.ll b/llvm/test/CodeGen/WebAssembly/multi-return.ll
index 3429cd5..293a1b3 100644
--- a/llvm/test/CodeGen/WebAssembly/multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/multi-return.ll
@@ -78,18 +78,16 @@ define i64 @test4() {
define { i64, i128 } @test5() {
; CHECK-LABEL: test5:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push8=, 8
-; CHECK: i32.add $push9=, $[[SP:[0-9]+]], $pop8
-; CHECK: i32.const $push0=, 16
-; CHECK: i32.add $push1=, $pop9, $pop0
+; CHECK: i32.const $push0=, 24
+; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]])
; CHECK: i64.load $push2=, 16($[[SP]])
; CHECK: i64.store 8($0), $pop2
+; CHECK: i64.store 16($0), $[[L1]]
; CHECK: i64.store 0($0), $[[L2]]
-; CHECK: i32.const $push12=, 16
-; CHECK: i32.add $push3=, $0, $pop12
-; CHECK: i64.store 0($pop3), $[[L1]]
+; CHECK: i32.const $push5=, 80
+; CHECK: i32.add $push6=, $3, $pop5
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
@@ -101,20 +99,20 @@ define { i64, i128 } @test5() {
define { i128, i128 } @test6() {
; CHECK-LABEL: test6:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push0=, 64
+; CHECK: i32.const $push0=, 24
; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const $push2=, 24
+; CHECK: i32.const $push2=, 64
; CHECK: i32.add $push3=, $[[SP]], $pop2
; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
; CHECK: i64.load $[[L3:[0-9]+]]=, 16($[[SP]])
; CHECK: i64.load $push4=, 56($[[SP]])
; CHECK: i64.store 16($0), $pop4
+; CHECK: i64.store 24($0), $[[L2]]
; CHECK: i64.store 0($0), $[[L3]]
-; CHECK: i64.store 8($0), $[[L2]]
-; CHECK: i32.const $push5=, 24
-; CHECK: i32.add $push6=, $0, $pop5
-; CHECK: i64.store 0($pop6), $[[L1]]
+; CHECK: i64.store 8($0), $[[L1]]
+; CHECK: i32.const $push7=, 80
+; CHECK: i32.add $push8=, $4, $pop7
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
%r3 = extractvalue { i64, i128, i192, i128, i64 } %t0, 3
@@ -129,19 +127,17 @@ define { i64, i192 } @test7() {
; CHECK: i32.const $push0=, 40
; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
+; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]])
+; CHECK: i64.load $[[L3:[0-9]+]]=, 32($[[SP]])
; CHECK: i32.const $push2=, 48
; CHECK: i32.add $push3=, $[[SP]], $pop2
-; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i64.load $[[L3:[0-9]+]]=, 8($[[SP]])
-; CHECK: i64.load $push4=, 32($[[SP]])
-; CHECK: i64.store 8($0), $pop4
-; CHECK: i64.store 0($0), $[[L3]]
-; CHECK: i32.const $push5=, 24
-; CHECK: i32.add $push6=, $0, $pop5
-; CHECK: i64.store 0($pop6), $[[L2]]
-; CHECK: i32.const $push7=, 16
-; CHECK: i32.add $push8=, $0, $pop7
-; CHECK: i64.store 0($pop8), $[[L1]]
+; CHECK: i64.load $push4=, 0($pop3)
+; CHECK: i64.store 24($0), $pop4
+; CHECK: i64.store 8($0), $[[L3]]
+; CHECK: i64.store 16($0), $[[L1]]
+; CHECK: i64.store 0($0), $[[L2]]
+; CHECK: i32.const $push7=, 80
+; CHECK: i32.add $push8=, $4, $pop7
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r2 = extractvalue { i64, i128, i192, i128, i64 } %t0, 2
@@ -153,18 +149,16 @@ define { i64, i192 } @test7() {
define { i128, i192, i128, i64 } @test8() {
; CHECK-LABEL: test8:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push18=, 8
-; CHECK: i32.add $push19=, $[[SP:[0-9]+]], $pop18
-; CHECK: i32.const $push0=, 32
-; CHECK: i32.add $push1=, $pop19, $pop0
+; CHECK: i32.const $push0=, 64
+; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const $push2=, 48
+; CHECK: i32.const $push2=, 40
; CHECK: i32.add $push3=, $[[SP]], $pop2
; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i32.const $push4=, 24
+; CHECK: i32.const $push4=, 48
; CHECK: i32.add $push5=, $[[SP]], $pop4
; CHECK: i64.load $[[L3:[0-9]+]]=, 0($pop5)
-; CHECK: i32.const $push6=, 64
+; CHECK: i32.const $push6=, 24
; CHECK: i32.add $push7=, $[[SP]], $pop6
; CHECK: i64.load $[[L4:[0-9]+]]=, 0($pop7)
; CHECK: i64.load $[[L5:[0-9]+]]=, 8($[[SP]])
@@ -172,19 +166,15 @@ define { i128, i192, i128, i64 } @test8() {
; CHECK: i64.load $[[L7:[0-9]+]]=, 32($[[SP]])
; CHECK: i64.load $push8=, 16($[[SP]])
; CHECK: i64.store 40($0), $pop8
+; CHECK: i64.store 48($0), $[[L4]]
+; CHECK: i64.store 32($0), $[[L3]]
; CHECK: i64.store 16($0), $[[L7]]
+; CHECK: i64.store 24($0), $[[L2]]
; CHECK: i64.store 0($0), $[[L6]]
-; CHECK: i64.store 8($0), $[[L4]]
+; CHECK: i64.store 8($0), $[[L1]]
; CHECK: i64.store 56($0), $[[L5]]
-; CHECK: i32.const $push9=, 48
-; CHECK: i32.add $push10=, $0, $pop9
-; CHECK: i64.store 0($pop10), $[[L3]]
-; CHECK: i32.const $push22=, 32
-; CHECK: i32.add $push11=, $0, $pop22
-; CHECK: i64.store 0($pop11), $[[L2]]
-; CHECK: i32.const $push12=, 24
-; CHECK: i32.add $push13=, $0, $pop12
-; CHECK: i64.store 0($pop13), $[[L1]]
+; CHECK: i32.const $push11=, 80
+; CHECK: i32.add $push12=, $8, $pop11
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 3a806b9..761a754 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -31,60 +31,38 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: add_v16i8:
; NO-SIMD128: .functype add_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.add $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.add $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.add $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.add $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.add $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.add $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.add $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.add $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.add $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.add $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.add $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.add $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.add $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.add $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.add $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.add $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.add $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.add $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.add $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.add $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v16i8:
@@ -96,54 +74,32 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.add $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.add $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.add $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.add $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.add $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = add <16 x i8> %x, %y
ret <16 x i8> %a
@@ -165,60 +121,38 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: sub_v16i8:
; NO-SIMD128: .functype sub_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.sub $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.sub $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.sub $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.sub $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.sub $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.sub $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.sub $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.sub $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.sub $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.sub $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.sub $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.sub $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.sub $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.sub $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.sub $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.sub $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.sub $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.sub $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.sub $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.sub $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.sub $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.sub $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.sub $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.sub $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.sub $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.sub $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v16i8:
@@ -230,54 +164,32 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.sub $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.sub $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.sub $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.sub $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> %x, %y
ret <16 x i8> %a
@@ -425,60 +337,38 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: mul_v16i8:
; NO-SIMD128: .functype mul_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.mul $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.mul $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.mul $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.mul $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.mul $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.mul $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.mul $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.mul $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.mul $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.mul $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.mul $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.mul $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.mul $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.mul $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.mul $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.mul $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.mul $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.mul $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.mul $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.mul $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.mul $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.mul $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.mul $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.mul $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.mul $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.mul $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v16i8:
@@ -490,54 +380,32 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.mul $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.mul $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.mul $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.mul $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = mul <16 x i8> %x, %y
ret <16 x i8> %a
@@ -559,108 +427,86 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: min_s_v16i8:
; NO-SIMD128: .functype min_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31
-; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 13
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop7
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30
+; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29
; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push22=, 12
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29
-; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push28=, 11
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28
+; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop19
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27
+; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop23
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26
; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24
-; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push34=, 10
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27
-; NO-SIMD128-NEXT: i32.lt_s $push32=, $pop31, $pop30
-; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push40=, 9
-; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25
+; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop31
+; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24
+; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop35
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23
; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25
-; NO-SIMD128-NEXT: i32.lt_s $push44=, $pop43, $pop42
-; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-NEXT: i32.const $push50=, 7
-; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24
-; NO-SIMD128-NEXT: i32.lt_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 6
-; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22
+; NO-SIMD128-NEXT: i32.lt_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop43
+; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21
+; NO-SIMD128-NEXT: i32.lt_s $push46=, $pop45, $pop44
+; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop47
+; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20
+; NO-SIMD128-NEXT: i32.lt_s $push50=, $pop49, $pop48
+; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop51
+; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19
; NO-SIMD128-NEXT: i32.lt_s $push54=, $pop53, $pop52
-; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54
-; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55
-; NO-SIMD128-NEXT: i32.const $push62=, 5
-; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62
-; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22
-; NO-SIMD128-NEXT: i32.lt_s $push60=, $pop59, $pop58
-; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60
-; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61
-; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21
-; NO-SIMD128-NEXT: i32.lt_s $push66=, $pop65, $pop64
-; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop67
-; NO-SIMD128-NEXT: i32.const $push72=, 3
-; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20
-; NO-SIMD128-NEXT: i32.lt_s $push70=, $pop69, $pop68
-; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70
-; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19
-; NO-SIMD128-NEXT: i32.lt_s $push76=, $pop75, $pop74
-; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop77
-; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18
-; NO-SIMD128-NEXT: i32.lt_s $push80=, $pop79, $pop78
-; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop81
-; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17
-; NO-SIMD128-NEXT: i32.lt_s $push84=, $pop83, $pop82
-; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop55
+; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18
+; NO-SIMD128-NEXT: i32.lt_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop59
+; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17
+; NO-SIMD128-NEXT: i32.lt_s $push62=, $pop61, $pop60
+; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop63
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v16i8:
@@ -681,93 +527,71 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24
; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27
; NO-SIMD128-FAST-NEXT: i32.lt_s $push42=, $pop41, $pop40
-; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28
; NO-SIMD128-FAST-NEXT: i32.lt_s $push46=, $pop45, $pop44
-; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push52=, $pop51, $pop50
-; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31
; NO-SIMD128-FAST-NEXT: i32.lt_s $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push64=, $pop63, $pop62
-; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push76=, $pop75, $pop74
-; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push82=, $pop81, $pop80
-; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push62=, $pop61, $pop60
+; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -790,140 +614,118 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: min_u_v16i8:
; NO-SIMD128: .functype min_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
-; NO-SIMD128-NEXT: i32.const $push117=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT: i32.const $push95=, 255
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push116=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115
-; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push114=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113
-; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push112=, 255
-; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112
-; NO-SIMD128-NEXT: i32.const $push111=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111
-; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push110=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 255
-; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109
-; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.const $push108=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107
-; NO-SIMD128-NEXT: i32.lt_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push106=, 255
-; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106
-; NO-SIMD128-NEXT: i32.const $push105=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105
-; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push104=, 255
-; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103
-; NO-SIMD128-NEXT: i32.lt_u $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.const $push102=, 255
-; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101
-; NO-SIMD128-NEXT: i32.lt_u $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99
-; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97
-; NO-SIMD128-NEXT: i32.lt_u $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95
-; NO-SIMD128-NEXT: i32.lt_u $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93
-; NO-SIMD128-NEXT: i32.lt_u $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93
+; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92
; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91
-; NO-SIMD128-NEXT: i32.lt_u $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91
+; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90
; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89
-; NO-SIMD128-NEXT: i32.lt_u $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89
+; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88
; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87
-; NO-SIMD128-NEXT: i32.lt_u $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87
+; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push86=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85
+; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push84=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84
+; NO-SIMD128-NEXT: i32.const $push83=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83
+; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push82=, 255
+; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81
+; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.const $push80=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 255
+; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79
+; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push78=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78
+; NO-SIMD128-NEXT: i32.const $push77=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77
+; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.const $push76=, 255
+; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75
+; NO-SIMD128-NEXT: i32.lt_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.const $push74=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 255
+; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73
+; NO-SIMD128-NEXT: i32.lt_u $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.const $push72=, 255
+; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72
+; NO-SIMD128-NEXT: i32.const $push71=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71
+; NO-SIMD128-NEXT: i32.lt_u $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69
+; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67
+; NO-SIMD128-NEXT: i32.lt_u $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65
+; NO-SIMD128-NEXT: i32.lt_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v16i8:
@@ -931,138 +733,116 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95
; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93
; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91
; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89
; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106
-; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81
; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75
; NO-SIMD128-FAST-NEXT: i32.lt_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73
; NO-SIMD128-FAST-NEXT: i32.lt_u $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67
; NO-SIMD128-FAST-NEXT: i32.lt_u $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -1085,108 +865,86 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: max_s_v16i8:
; NO-SIMD128: .functype max_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31
-; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 13
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop7
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30
+; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29
; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push22=, 12
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29
-; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push28=, 11
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28
+; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop19
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27
+; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop23
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26
; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24
-; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push34=, 10
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27
-; NO-SIMD128-NEXT: i32.gt_s $push32=, $pop31, $pop30
-; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push40=, 9
-; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25
+; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop31
+; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24
+; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop35
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23
; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25
-; NO-SIMD128-NEXT: i32.gt_s $push44=, $pop43, $pop42
-; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-NEXT: i32.const $push50=, 7
-; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24
-; NO-SIMD128-NEXT: i32.gt_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 6
-; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22
+; NO-SIMD128-NEXT: i32.gt_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop43
+; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21
+; NO-SIMD128-NEXT: i32.gt_s $push46=, $pop45, $pop44
+; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop47
+; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20
+; NO-SIMD128-NEXT: i32.gt_s $push50=, $pop49, $pop48
+; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop51
+; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19
; NO-SIMD128-NEXT: i32.gt_s $push54=, $pop53, $pop52
-; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54
-; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55
-; NO-SIMD128-NEXT: i32.const $push62=, 5
-; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62
-; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22
-; NO-SIMD128-NEXT: i32.gt_s $push60=, $pop59, $pop58
-; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60
-; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61
-; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21
-; NO-SIMD128-NEXT: i32.gt_s $push66=, $pop65, $pop64
-; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop67
-; NO-SIMD128-NEXT: i32.const $push72=, 3
-; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20
-; NO-SIMD128-NEXT: i32.gt_s $push70=, $pop69, $pop68
-; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70
-; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19
-; NO-SIMD128-NEXT: i32.gt_s $push76=, $pop75, $pop74
-; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop77
-; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18
-; NO-SIMD128-NEXT: i32.gt_s $push80=, $pop79, $pop78
-; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop81
-; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17
-; NO-SIMD128-NEXT: i32.gt_s $push84=, $pop83, $pop82
-; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop55
+; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18
+; NO-SIMD128-NEXT: i32.gt_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop59
+; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17
+; NO-SIMD128-NEXT: i32.gt_s $push62=, $pop61, $pop60
+; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop63
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v16i8:
@@ -1207,93 +965,71 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24
; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27
; NO-SIMD128-FAST-NEXT: i32.gt_s $push42=, $pop41, $pop40
-; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28
; NO-SIMD128-FAST-NEXT: i32.gt_s $push46=, $pop45, $pop44
-; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push52=, $pop51, $pop50
-; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31
; NO-SIMD128-FAST-NEXT: i32.gt_s $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push64=, $pop63, $pop62
-; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push76=, $pop75, $pop74
-; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push82=, $pop81, $pop80
-; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push62=, $pop61, $pop60
+; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -1316,140 +1052,118 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: max_u_v16i8:
; NO-SIMD128: .functype max_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
-; NO-SIMD128-NEXT: i32.const $push117=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT: i32.const $push95=, 255
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push116=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115
-; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push114=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113
-; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push112=, 255
-; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112
-; NO-SIMD128-NEXT: i32.const $push111=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111
-; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push110=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 255
-; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109
-; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.const $push108=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107
-; NO-SIMD128-NEXT: i32.gt_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push106=, 255
-; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106
-; NO-SIMD128-NEXT: i32.const $push105=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105
-; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push104=, 255
-; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103
-; NO-SIMD128-NEXT: i32.gt_u $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.const $push102=, 255
-; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101
-; NO-SIMD128-NEXT: i32.gt_u $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99
-; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97
-; NO-SIMD128-NEXT: i32.gt_u $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95
-; NO-SIMD128-NEXT: i32.gt_u $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93
-; NO-SIMD128-NEXT: i32.gt_u $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93
+; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92
; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91
-; NO-SIMD128-NEXT: i32.gt_u $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91
+; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90
; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89
-; NO-SIMD128-NEXT: i32.gt_u $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89
+; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88
; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87
-; NO-SIMD128-NEXT: i32.gt_u $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87
+; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push86=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85
+; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push84=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84
+; NO-SIMD128-NEXT: i32.const $push83=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83
+; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push82=, 255
+; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81
+; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.const $push80=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 255
+; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79
+; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push78=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78
+; NO-SIMD128-NEXT: i32.const $push77=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77
+; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.const $push76=, 255
+; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75
+; NO-SIMD128-NEXT: i32.gt_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.const $push74=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 255
+; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73
+; NO-SIMD128-NEXT: i32.gt_u $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.const $push72=, 255
+; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72
+; NO-SIMD128-NEXT: i32.const $push71=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71
+; NO-SIMD128-NEXT: i32.gt_u $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69
+; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67
+; NO-SIMD128-NEXT: i32.gt_u $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65
+; NO-SIMD128-NEXT: i32.gt_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v16i8:
@@ -1457,138 +1171,116 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95
; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93
; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91
; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89
; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106
-; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81
; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75
; NO-SIMD128-FAST-NEXT: i32.gt_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73
; NO-SIMD128-FAST-NEXT: i32.gt_u $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67
; NO-SIMD128-FAST-NEXT: i32.gt_u $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
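The hunk above exercises the unsigned-max idiom on <16 x i8>: without SIMD, each lane is compared and selected in scalar i32 locals, and the change folds each lane's constant address offset into the store8 immediate instead of materializing it with i32.const/i32.add. A minimal C++ sketch of one lane's semantics, with an illustrative name (umax_lane is not part of the test):

    #include <cstdint>

    // One umax lane: i8 values travel in 32-bit locals, so both operands are
    // masked to 8 bits before the unsigned compare; the selected, unmasked
    // local is then stored with store8, which keeps only the low byte.
    static inline uint8_t umax_lane(uint32_t x, uint32_t y) {
      uint32_t xm = x & 255;             // i32.and $x, 255
      uint32_t ym = y & 255;             // i32.and $y, 255
      return (uint8_t)(xm > ym ? x : y); // i32.gt_u, i32.select, i32.store8
    }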
@@ -1611,156 +1303,134 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: avgr_u_v16i8:
; NO-SIMD128: .functype avgr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 254
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push133=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push132=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132
-; NO-SIMD128-NEXT: i32.const $push131=, 254
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131
-; NO-SIMD128-NEXT: i32.const $push130=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push129=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129
-; NO-SIMD128-NEXT: i32.const $push128=, 254
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128
-; NO-SIMD128-NEXT: i32.const $push127=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.add $push22=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push126=, 1
-; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126
-; NO-SIMD128-NEXT: i32.const $push125=, 254
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125
-; NO-SIMD128-NEXT: i32.const $push124=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25
-; NO-SIMD128-NEXT: i32.const $push26=, 11
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.add $push28=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push123=, 1
-; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123
-; NO-SIMD128-NEXT: i32.const $push122=, 254
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122
-; NO-SIMD128-NEXT: i32.const $push121=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121
-; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31
-; NO-SIMD128-NEXT: i32.const $push32=, 10
-; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-NEXT: i32.add $push34=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push120=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120
-; NO-SIMD128-NEXT: i32.const $push119=, 254
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119
-; NO-SIMD128-NEXT: i32.const $push118=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118
-; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37
-; NO-SIMD128-NEXT: i32.const $push38=, 9
-; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-NEXT: i32.add $push40=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push117=, 1
-; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117
-; NO-SIMD128-NEXT: i32.const $push116=, 254
-; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115
-; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43
-; NO-SIMD128-NEXT: i32.add $push44=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push114=, 1
-; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 254
-; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113
-; NO-SIMD128-NEXT: i32.const $push112=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop47
-; NO-SIMD128-NEXT: i32.const $push48=, 7
-; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-NEXT: i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 254
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push111=, 1
-; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111
-; NO-SIMD128-NEXT: i32.const $push110=, 254
-; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109
-; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53
-; NO-SIMD128-NEXT: i32.const $push54=, 6
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push110=, 1
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-NEXT: i32.const $push109=, 254
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109
; NO-SIMD128-NEXT: i32.const $push108=, 1
-; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 254
-; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107
-; NO-SIMD128-NEXT: i32.const $push106=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59
-; NO-SIMD128-NEXT: i32.const $push60=, 5
-; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-NEXT: i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push107=, 1
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-NEXT: i32.const $push106=, 254
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-NEXT: i32.const $push105=, 1
-; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105
-; NO-SIMD128-NEXT: i32.const $push104=, 254
-; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103
-; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65
-; NO-SIMD128-NEXT: i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push104=, 1
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-NEXT: i32.const $push103=, 254
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-NEXT: i32.const $push102=, 1
-; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 254
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101
-; NO-SIMD128-NEXT: i32.const $push100=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop69
-; NO-SIMD128-NEXT: i32.const $push70=, 3
-; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-NEXT: i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push101=, 1
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-NEXT: i32.const $push100=, 254
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-NEXT: i32.const $push99=, 1
-; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99
-; NO-SIMD128-NEXT: i32.const $push98=, 254
-; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97
-; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-NEXT: i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push98=, 1
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-NEXT: i32.const $push97=, 254
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-NEXT: i32.const $push96=, 1
-; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 254
-; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95
-; NO-SIMD128-NEXT: i32.const $push94=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop79
-; NO-SIMD128-NEXT: i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push95=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-NEXT: i32.const $push94=, 254
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 1
-; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93
-; NO-SIMD128-NEXT: i32.const $push92=, 254
-; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop83
-; NO-SIMD128-NEXT: i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push92=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-NEXT: i32.const $push91=, 254
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-NEXT: i32.const $push90=, 1
-; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 254
-; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-NEXT: i32.const $push88=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop33
+; NO-SIMD128-NEXT: i32.add $push34=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push89=, 1
+; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-NEXT: i32.const $push88=, 254
+; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-NEXT: i32.const $push87=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop37
+; NO-SIMD128-NEXT: i32.add $push38=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push86=, 1
+; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 254
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-NEXT: i32.const $push84=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push42=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push83=, 1
+; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-NEXT: i32.const $push82=, 254
+; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop45
+; NO-SIMD128-NEXT: i32.add $push46=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push80=, 1
+; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 254
+; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-NEXT: i32.const $push78=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop49
+; NO-SIMD128-NEXT: i32.add $push50=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push77=, 1
+; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-NEXT: i32.const $push76=, 254
+; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop53
+; NO-SIMD128-NEXT: i32.add $push54=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push74=, 1
+; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 254
+; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-NEXT: i32.const $push72=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop57
+; NO-SIMD128-NEXT: i32.add $push58=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push71=, 1
+; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-NEXT: i32.const $push70=, 254
+; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop61
+; NO-SIMD128-NEXT: i32.add $push62=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push68=, 1
+; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 254
+; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-NEXT: i32.const $push66=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop65
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v16i8:
@@ -1771,151 +1441,129 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132
-; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131
-; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109
+; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129
-; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128
-; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126
-; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125
-; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123
-; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122
-; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120
-; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119
-; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76
-; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82
-; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53
+; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57
+; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61
+; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65
; NO-SIMD128-FAST-NEXT: return
%a = add nuw <16 x i8> %x, %y
%b = add nuw <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
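Both avgr_u_v16i8 hunks check the open-coded rounded average (x + y + 1) >> 1 per lane; the nuw flags assert that the byte sums cannot wrap, and the updated output again folds each lane's offset into the store8 immediate. A hedged one-lane model in C++ (avgr_lane is an illustrative name, not from the test):

    #include <cstdint>

    // One avgr_u lane: the 32-bit sum x + y + 1 is masked with 254 and
    // shifted right by one; ((s & 0xFE) >> 1) == ((s >> 1) & 0x7F), which is
    // the rounded average (x + y + 1) / 2 whenever the sum fits in 8 bits.
    static inline uint8_t avgr_lane(uint8_t x, uint8_t y) {
      uint32_t s = (uint32_t)x + (uint32_t)y + 1; // the two i32.add ops
      return (uint8_t)((s & 254) >> 1);           // i32.and 254, i32.shr_u 1
    }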
@@ -1949,156 +1597,134 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: avgr_u_v16i8_wrap:
; NO-SIMD128: .functype avgr_u_v16i8_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 254
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push133=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push132=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132
-; NO-SIMD128-NEXT: i32.const $push131=, 254
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131
-; NO-SIMD128-NEXT: i32.const $push130=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push129=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129
-; NO-SIMD128-NEXT: i32.const $push128=, 254
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128
-; NO-SIMD128-NEXT: i32.const $push127=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.add $push22=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push126=, 1
-; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126
-; NO-SIMD128-NEXT: i32.const $push125=, 254
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125
-; NO-SIMD128-NEXT: i32.const $push124=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25
-; NO-SIMD128-NEXT: i32.const $push26=, 11
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.add $push28=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push123=, 1
-; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123
-; NO-SIMD128-NEXT: i32.const $push122=, 254
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122
-; NO-SIMD128-NEXT: i32.const $push121=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121
-; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31
-; NO-SIMD128-NEXT: i32.const $push32=, 10
-; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-NEXT: i32.add $push34=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push120=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120
-; NO-SIMD128-NEXT: i32.const $push119=, 254
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119
-; NO-SIMD128-NEXT: i32.const $push118=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118
-; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37
-; NO-SIMD128-NEXT: i32.const $push38=, 9
-; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-NEXT: i32.add $push40=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push117=, 1
-; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117
-; NO-SIMD128-NEXT: i32.const $push116=, 254
-; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115
-; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43
-; NO-SIMD128-NEXT: i32.add $push44=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push114=, 1
-; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 254
-; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113
-; NO-SIMD128-NEXT: i32.const $push112=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop47
-; NO-SIMD128-NEXT: i32.const $push48=, 7
-; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-NEXT: i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 254
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push111=, 1
-; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111
-; NO-SIMD128-NEXT: i32.const $push110=, 254
-; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109
-; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53
-; NO-SIMD128-NEXT: i32.const $push54=, 6
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push110=, 1
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-NEXT: i32.const $push109=, 254
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109
; NO-SIMD128-NEXT: i32.const $push108=, 1
-; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 254
-; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107
-; NO-SIMD128-NEXT: i32.const $push106=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59
-; NO-SIMD128-NEXT: i32.const $push60=, 5
-; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-NEXT: i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push107=, 1
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-NEXT: i32.const $push106=, 254
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-NEXT: i32.const $push105=, 1
-; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105
-; NO-SIMD128-NEXT: i32.const $push104=, 254
-; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103
-; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65
-; NO-SIMD128-NEXT: i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push104=, 1
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-NEXT: i32.const $push103=, 254
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-NEXT: i32.const $push102=, 1
-; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 254
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101
-; NO-SIMD128-NEXT: i32.const $push100=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop69
-; NO-SIMD128-NEXT: i32.const $push70=, 3
-; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-NEXT: i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push101=, 1
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-NEXT: i32.const $push100=, 254
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-NEXT: i32.const $push99=, 1
-; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99
-; NO-SIMD128-NEXT: i32.const $push98=, 254
-; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97
-; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-NEXT: i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push98=, 1
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-NEXT: i32.const $push97=, 254
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-NEXT: i32.const $push96=, 1
-; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 254
-; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95
-; NO-SIMD128-NEXT: i32.const $push94=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop79
-; NO-SIMD128-NEXT: i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push95=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-NEXT: i32.const $push94=, 254
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 1
-; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93
-; NO-SIMD128-NEXT: i32.const $push92=, 254
-; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop83
-; NO-SIMD128-NEXT: i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push92=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-NEXT: i32.const $push91=, 254
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-NEXT: i32.const $push90=, 1
-; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 254
-; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-NEXT: i32.const $push88=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop33
+; NO-SIMD128-NEXT: i32.add $push34=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push89=, 1
+; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-NEXT: i32.const $push88=, 254
+; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-NEXT: i32.const $push87=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop37
+; NO-SIMD128-NEXT: i32.add $push38=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push86=, 1
+; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 254
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-NEXT: i32.const $push84=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push42=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push83=, 1
+; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-NEXT: i32.const $push82=, 254
+; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop45
+; NO-SIMD128-NEXT: i32.add $push46=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push80=, 1
+; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 254
+; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-NEXT: i32.const $push78=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop49
+; NO-SIMD128-NEXT: i32.add $push50=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push77=, 1
+; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-NEXT: i32.const $push76=, 254
+; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop53
+; NO-SIMD128-NEXT: i32.add $push54=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push74=, 1
+; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 254
+; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-NEXT: i32.const $push72=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop57
+; NO-SIMD128-NEXT: i32.add $push58=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push71=, 1
+; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-NEXT: i32.const $push70=, 254
+; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop61
+; NO-SIMD128-NEXT: i32.add $push62=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push68=, 1
+; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 254
+; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-NEXT: i32.const $push66=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop65
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v16i8_wrap:
@@ -2109,151 +1735,129 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132
-; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131
-; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109
+; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129
-; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128
-; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126
-; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125
-; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123
-; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122
-; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120
-; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119
-; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76
-; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82
-; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53
+; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57
+; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61
+; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65
; NO-SIMD128-FAST-NEXT: return
%a = add <16 x i8> %x, %y
%b = add <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
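avgr_u_v16i8_wrap drops the nuw flags, so the i8 sums may wrap, yet the checked instruction sequence is identical: masking the 32-bit sum with 254 before the shift discards exactly the bits an i8 truncation would, so the wrapped average falls out of the same code. Illustrative values (not from the test), checked at compile time:

    #include <cstdint>

    // 200 + 100 + 1 = 301 wraps to 45 as an i8; the mask-and-shift on the
    // full 32-bit sum gives the same wrapped average, 45 >> 1 == 22.
    static_assert(((301u & 254u) >> 1) == 22, "mask-and-shift result");
    static_assert((static_cast<uint8_t>(301) >> 1) == 22, "i8-truncated average");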
@@ -2279,140 +1883,118 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) {
; NO-SIMD128-LABEL: abs_v16i8:
; NO-SIMD128: .functype abs_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $16
; NO-SIMD128-NEXT: i32.const $push1=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push117=, $pop0, $pop1
-; NO-SIMD128-NEXT: local.tee $push116=, $17=, $pop117
-; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop116
+; NO-SIMD128-NEXT: i32.shr_s $push95=, $pop0, $pop1
+; NO-SIMD128-NEXT: local.tee $push94=, $17=, $pop95
+; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop94
; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $15
-; NO-SIMD128-NEXT: i32.const $push115=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push114=, $pop6, $pop115
-; NO-SIMD128-NEXT: local.tee $push113=, $16=, $pop114
-; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop113
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $14
-; NO-SIMD128-NEXT: i32.const $push112=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push111=, $pop11, $pop112
-; NO-SIMD128-NEXT: local.tee $push110=, $16=, $pop111
-; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop110
-; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop13
-; NO-SIMD128-NEXT: i32.const $push19=, 12
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend8_s $push16=, $13
-; NO-SIMD128-NEXT: i32.const $push109=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push108=, $pop16, $pop109
-; NO-SIMD128-NEXT: local.tee $push107=, $16=, $pop108
-; NO-SIMD128-NEXT: i32.xor $push17=, $13, $pop107
-; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 11
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.extend8_s $push21=, $12
-; NO-SIMD128-NEXT: i32.const $push106=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push105=, $pop21, $pop106
-; NO-SIMD128-NEXT: local.tee $push104=, $16=, $pop105
-; NO-SIMD128-NEXT: i32.xor $push22=, $12, $pop104
-; NO-SIMD128-NEXT: i32.sub $push23=, $pop22, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 10
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push26=, $11
-; NO-SIMD128-NEXT: i32.const $push103=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push102=, $pop26, $pop103
-; NO-SIMD128-NEXT: local.tee $push101=, $16=, $pop102
-; NO-SIMD128-NEXT: i32.xor $push27=, $11, $pop101
-; NO-SIMD128-NEXT: i32.sub $push28=, $pop27, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 9
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $10
-; NO-SIMD128-NEXT: i32.const $push100=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push99=, $pop31, $pop100
-; NO-SIMD128-NEXT: local.tee $push98=, $16=, $pop99
-; NO-SIMD128-NEXT: i32.xor $push32=, $10, $pop98
-; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $9
-; NO-SIMD128-NEXT: i32.const $push97=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push96=, $pop36, $pop97
-; NO-SIMD128-NEXT: local.tee $push95=, $16=, $pop96
-; NO-SIMD128-NEXT: i32.xor $push37=, $9, $pop95
-; NO-SIMD128-NEXT: i32.sub $push38=, $pop37, $16
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop38
-; NO-SIMD128-NEXT: i32.const $push94=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop94
-; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15
; NO-SIMD128-NEXT: i32.const $push93=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop39, $pop93
+; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop4, $pop93
; NO-SIMD128-NEXT: local.tee $push91=, $16=, $pop92
-; NO-SIMD128-NEXT: i32.xor $push40=, $8, $pop91
-; NO-SIMD128-NEXT: i32.sub $push41=, $pop40, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop41
-; NO-SIMD128-NEXT: i32.const $push46=, 6
-; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop91
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14
; NO-SIMD128-NEXT: i32.const $push90=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop43, $pop90
+; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop7, $pop90
; NO-SIMD128-NEXT: local.tee $push88=, $16=, $pop89
-; NO-SIMD128-NEXT: i32.xor $push44=, $7, $pop88
-; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.extend8_s $push48=, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $14, $pop88
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $16
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13
; NO-SIMD128-NEXT: i32.const $push87=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop48, $pop87
+; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop10, $pop87
; NO-SIMD128-NEXT: local.tee $push85=, $16=, $pop86
-; NO-SIMD128-NEXT: i32.xor $push49=, $6, $pop85
-; NO-SIMD128-NEXT: i32.sub $push50=, $pop49, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $13, $pop85
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $16
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12
; NO-SIMD128-NEXT: i32.const $push84=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop53, $pop84
+; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop13, $pop84
; NO-SIMD128-NEXT: local.tee $push82=, $16=, $pop83
-; NO-SIMD128-NEXT: i32.xor $push54=, $5, $pop82
-; NO-SIMD128-NEXT: i32.sub $push55=, $pop54, $16
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop55
-; NO-SIMD128-NEXT: i32.const $push59=, 3
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.extend8_s $push56=, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $12, $pop82
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $16
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11
; NO-SIMD128-NEXT: i32.const $push81=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop56, $pop81
+; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop16, $pop81
; NO-SIMD128-NEXT: local.tee $push79=, $16=, $pop80
-; NO-SIMD128-NEXT: i32.xor $push57=, $4, $pop79
-; NO-SIMD128-NEXT: i32.sub $push58=, $pop57, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.extend8_s $push61=, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $11, $pop79
+; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10
; NO-SIMD128-NEXT: i32.const $push78=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop61, $pop78
+; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop19, $pop78
; NO-SIMD128-NEXT: local.tee $push76=, $16=, $pop77
-; NO-SIMD128-NEXT: i32.xor $push62=, $3, $pop76
-; NO-SIMD128-NEXT: i32.sub $push63=, $pop62, $16
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop63
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $10, $pop76
+; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $16
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9
; NO-SIMD128-NEXT: i32.const $push75=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop64, $pop75
+; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop22, $pop75
; NO-SIMD128-NEXT: local.tee $push73=, $16=, $pop74
-; NO-SIMD128-NEXT: i32.xor $push65=, $2, $pop73
-; NO-SIMD128-NEXT: i32.sub $push66=, $pop65, $16
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop66
-; NO-SIMD128-NEXT: i32.extend8_s $push67=, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $9, $pop73
+; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $16
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $8
; NO-SIMD128-NEXT: i32.const $push72=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop67, $pop72
+; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop25, $pop72
; NO-SIMD128-NEXT: local.tee $push70=, $16=, $pop71
-; NO-SIMD128-NEXT: i32.xor $push68=, $1, $pop70
-; NO-SIMD128-NEXT: i32.sub $push69=, $pop68, $16
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT: i32.xor $push26=, $8, $pop70
+; NO-SIMD128-NEXT: i32.sub $push27=, $pop26, $16
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $7
+; NO-SIMD128-NEXT: i32.const $push69=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop28, $pop69
+; NO-SIMD128-NEXT: local.tee $push67=, $16=, $pop68
+; NO-SIMD128-NEXT: i32.xor $push29=, $7, $pop67
+; NO-SIMD128-NEXT: i32.sub $push30=, $pop29, $16
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push31=, $6
+; NO-SIMD128-NEXT: i32.const $push66=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push65=, $pop31, $pop66
+; NO-SIMD128-NEXT: local.tee $push64=, $16=, $pop65
+; NO-SIMD128-NEXT: i32.xor $push32=, $6, $pop64
+; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.extend8_s $push34=, $5
+; NO-SIMD128-NEXT: i32.const $push63=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push62=, $pop34, $pop63
+; NO-SIMD128-NEXT: local.tee $push61=, $16=, $pop62
+; NO-SIMD128-NEXT: i32.xor $push35=, $5, $pop61
+; NO-SIMD128-NEXT: i32.sub $push36=, $pop35, $16
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $4
+; NO-SIMD128-NEXT: i32.const $push60=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push59=, $pop37, $pop60
+; NO-SIMD128-NEXT: local.tee $push58=, $16=, $pop59
+; NO-SIMD128-NEXT: i32.xor $push38=, $4, $pop58
+; NO-SIMD128-NEXT: i32.sub $push39=, $pop38, $16
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $3
+; NO-SIMD128-NEXT: i32.const $push57=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push56=, $pop40, $pop57
+; NO-SIMD128-NEXT: local.tee $push55=, $16=, $pop56
+; NO-SIMD128-NEXT: i32.xor $push41=, $3, $pop55
+; NO-SIMD128-NEXT: i32.sub $push42=, $pop41, $16
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.extend8_s $push43=, $2
+; NO-SIMD128-NEXT: i32.const $push54=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop43, $pop54
+; NO-SIMD128-NEXT: local.tee $push52=, $16=, $pop53
+; NO-SIMD128-NEXT: i32.xor $push44=, $2, $pop52
+; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.extend8_s $push46=, $1
+; NO-SIMD128-NEXT: i32.const $push51=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push50=, $pop46, $pop51
+; NO-SIMD128-NEXT: local.tee $push49=, $16=, $pop50
+; NO-SIMD128-NEXT: i32.xor $push47=, $1, $pop49
+; NO-SIMD128-NEXT: i32.sub $push48=, $pop47, $16
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v16i8:
@@ -2420,138 +2002,116 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push0=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push1=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push117=, $pop0, $pop1
-; NO-SIMD128-FAST-NEXT: local.tee $push116=, $17=, $pop117
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop116
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT: local.tee $push94=, $17=, $pop95
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop94
; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $17
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push4=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push114=, $pop4, $pop115
-; NO-SIMD128-FAST-NEXT: local.tee $push113=, $1=, $pop114
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop4, $pop93
+; NO-SIMD128-FAST-NEXT: local.tee $push91=, $1=, $pop92
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop91
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push111=, $pop7, $pop112
-; NO-SIMD128-FAST-NEXT: local.tee $push110=, $2=, $pop111
-; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop7, $pop90
+; NO-SIMD128-FAST-NEXT: local.tee $push88=, $2=, $pop89
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop88
; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push108=, $pop10, $pop109
-; NO-SIMD128-FAST-NEXT: local.tee $push107=, $3=, $pop108
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop107
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push105=, $pop15, $pop106
-; NO-SIMD128-FAST-NEXT: local.tee $push104=, $4=, $pop105
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop104
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push102=, $pop18, $pop103
-; NO-SIMD128-FAST-NEXT: local.tee $push101=, $5=, $pop102
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop101
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push99=, $pop23, $pop100
-; NO-SIMD128-FAST-NEXT: local.tee $push98=, $6=, $pop99
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop98
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop97
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop28, $pop96
-; NO-SIMD128-FAST-NEXT: local.tee $push94=, $7=, $pop95
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop94
-; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $7
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop30
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $9
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop32, $pop93
-; NO-SIMD128-FAST-NEXT: local.tee $push91=, $8=, $pop92
-; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop91
-; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $pop33, $8
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop35, $pop90
-; NO-SIMD128-FAST-NEXT: local.tee $push88=, $9=, $pop89
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $10, $pop88
-; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $pop36, $9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $11
; NO-SIMD128-FAST-NEXT: i32.const $push87=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop40, $pop87
-; NO-SIMD128-FAST-NEXT: local.tee $push85=, $10=, $pop86
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop85
-; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop10, $pop87
+; NO-SIMD128-FAST-NEXT: local.tee $push85=, $3=, $pop86
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop85
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5
; NO-SIMD128-FAST-NEXT: i32.const $push84=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop45, $pop84
-; NO-SIMD128-FAST-NEXT: local.tee $push82=, $11=, $pop83
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop82
-; NO-SIMD128-FAST-NEXT: i32.sub $push47=, $pop46, $11
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $13
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop13, $pop84
+; NO-SIMD128-FAST-NEXT: local.tee $push82=, $4=, $pop83
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop82
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6
; NO-SIMD128-FAST-NEXT: i32.const $push81=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop50, $pop81
-; NO-SIMD128-FAST-NEXT: local.tee $push79=, $12=, $pop80
-; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $13, $pop79
-; NO-SIMD128-FAST-NEXT: i32.sub $push52=, $pop51, $12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push55=, $14
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop16, $pop81
+; NO-SIMD128-FAST-NEXT: local.tee $push79=, $5=, $pop80
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop79
+; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
; NO-SIMD128-FAST-NEXT: i32.const $push78=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop55, $pop78
-; NO-SIMD128-FAST-NEXT: local.tee $push76=, $13=, $pop77
-; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $14, $pop76
-; NO-SIMD128-FAST-NEXT: i32.sub $push57=, $pop56, $13
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop19, $pop78
+; NO-SIMD128-FAST-NEXT: local.tee $push76=, $6=, $pop77
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop76
+; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8
; NO-SIMD128-FAST-NEXT: i32.const $push75=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop60, $pop75
-; NO-SIMD128-FAST-NEXT: local.tee $push73=, $14=, $pop74
-; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $15, $pop73
-; NO-SIMD128-FAST-NEXT: i32.sub $push62=, $pop61, $14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push69=, $0, $pop68
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push65=, $16
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop22, $pop75
+; NO-SIMD128-FAST-NEXT: local.tee $push73=, $7=, $pop74
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop73
+; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9
; NO-SIMD128-FAST-NEXT: i32.const $push72=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop65, $pop72
-; NO-SIMD128-FAST-NEXT: local.tee $push70=, $0=, $pop71
-; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $16, $pop70
-; NO-SIMD128-FAST-NEXT: i32.sub $push67=, $pop66, $0
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop69), $pop67
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop25, $pop72
+; NO-SIMD128-FAST-NEXT: local.tee $push70=, $8=, $pop71
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $9, $pop70
+; NO-SIMD128-FAST-NEXT: i32.sub $push27=, $pop26, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $10
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push68=, $pop28, $pop69
+; NO-SIMD128-FAST-NEXT: local.tee $push67=, $9=, $pop68
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $10, $pop67
+; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $11
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop31, $pop66
+; NO-SIMD128-FAST-NEXT: local.tee $push64=, $10=, $pop65
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $11, $pop64
+; NO-SIMD128-FAST-NEXT: i32.sub $push33=, $pop32, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $12
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push62=, $pop34, $pop63
+; NO-SIMD128-FAST-NEXT: local.tee $push61=, $11=, $pop62
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $12, $pop61
+; NO-SIMD128-FAST-NEXT: i32.sub $push36=, $pop35, $11
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $13
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push59=, $pop37, $pop60
+; NO-SIMD128-FAST-NEXT: local.tee $push58=, $12=, $pop59
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $13, $pop58
+; NO-SIMD128-FAST-NEXT: i32.sub $push39=, $pop38, $12
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $14
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push56=, $pop40, $pop57
+; NO-SIMD128-FAST-NEXT: local.tee $push55=, $13=, $pop56
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $14, $pop55
+; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $13
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push43=, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push53=, $pop43, $pop54
+; NO-SIMD128-FAST-NEXT: local.tee $push52=, $14=, $pop53
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $15, $pop52
+; NO-SIMD128-FAST-NEXT: i32.sub $push45=, $pop44, $14
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push46=, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop46, $pop51
+; NO-SIMD128-FAST-NEXT: local.tee $push49=, $15=, $pop50
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $16, $pop49
+; NO-SIMD128-FAST-NEXT: i32.sub $push48=, $pop47, $15
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> zeroinitializer, %x
%b = icmp slt <16 x i8> %x, zeroinitializer
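(For reference: the scalarized NO-SIMD128 expansion checked above lowers each i8 lane of the abs through the classic branchless sign-mask idiom — sign-extend the byte, form the sign mask with an arithmetic shift right by 7, then compute (x ^ mask) - mask. A minimal C sketch of the per-lane computation follows; it is illustrative only and not part of the test or the commit. Note that in strict ISO C, right-shifting a negative value is implementation-defined, though it is arithmetic on the common targets this mirrors, matching wasm's i32.shr_s.)

    #include <stdint.h>
    #include <stdio.h>

    /* Branchless abs of one i8 lane, mirroring the expansion above:
     * mask = x >> 7 (arithmetic), abs = (x ^ mask) - mask. */
    static int8_t abs_lane(int8_t x) {
        int8_t mask = (int8_t)(x >> 7);   /* 0 for x >= 0, -1 for x < 0 */
        return (int8_t)((x ^ mask) - mask);
    }

    int main(void) {
        printf("%d %d %d\n", abs_lane(-5), abs_lane(7), abs_lane(-128));
        /* prints: 5 7 -128 -- INT8_MIN wraps around, which matches the
         * two's-complement xor/sub semantics of the <16 x i8> IR. */
        return 0;
    }
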
@@ -2576,75 +2136,53 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
; NO-SIMD128: .functype neg_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $9
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop53, $5
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop52, $3
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop51, $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, 0
-; NO-SIMD128-NEXT: i32.sub $push5=, $pop50, $1
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, 0
-; NO-SIMD128-NEXT: i32.sub $push6=, $pop49, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, 0
-; NO-SIMD128-NEXT: i32.sub $push9=, $pop48, $15
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, 0
-; NO-SIMD128-NEXT: i32.sub $push12=, $pop47, $14
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, 0
-; NO-SIMD128-NEXT: i32.sub $push15=, $pop46, $13
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, 0
-; NO-SIMD128-NEXT: i32.sub $push18=, $pop45, $12
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, 0
-; NO-SIMD128-NEXT: i32.sub $push21=, $pop44, $11
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, 0
-; NO-SIMD128-NEXT: i32.sub $push24=, $pop43, $10
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, 0
-; NO-SIMD128-NEXT: i32.sub $push27=, $pop42, $8
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, 0
-; NO-SIMD128-NEXT: i32.sub $push30=, $pop41, $7
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push40=, 0
-; NO-SIMD128-NEXT: i32.sub $push33=, $pop40, $6
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push39=, 0
-; NO-SIMD128-NEXT: i32.sub $push36=, $pop39, $4
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $16
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, 0
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop31, $15
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop30, $14
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop29, $13
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, 0
+; NO-SIMD128-NEXT: i32.sub $push5=, $pop28, $12
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, 0
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop27, $11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, 0
+; NO-SIMD128-NEXT: i32.sub $push7=, $pop26, $10
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, 0
+; NO-SIMD128-NEXT: i32.sub $push8=, $pop25, $9
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, 0
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop24, $8
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, 0
+; NO-SIMD128-NEXT: i32.sub $push10=, $pop23, $7
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, 0
+; NO-SIMD128-NEXT: i32.sub $push11=, $pop22, $6
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, 0
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop21, $5
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, 0
+; NO-SIMD128-NEXT: i32.sub $push13=, $pop20, $4
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, 0
+; NO-SIMD128-NEXT: i32.sub $push14=, $pop19, $3
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, 0
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop18, $2
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, 0
+; NO-SIMD128-NEXT: i32.sub $push16=, $pop17, $1
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v16i8:
@@ -2653,73 +2191,51 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop53, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop31, $2
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop52, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop30, $3
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop51, $4
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop50, $5
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop49, $6
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop48, $7
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop47, $8
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop46, $9
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop45, $10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push23=, $pop44, $11
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push26=, $pop43, $12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push29=, $pop42, $13
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push32=, $pop41, $14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push35=, $pop40, $15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push38=, $pop39, $16
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop29, $4
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop28, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop27, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop26, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop25, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop23, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $pop22, $11
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop21, $12
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop20, $13
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $pop19, $14
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop18, $15
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $16
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
@@ -2744,124 +2260,80 @@ define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push40=, $17, $pop0
-; NO-SIMD128-NEXT: local.tee $push39=, $17=, $pop40
-; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop39
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.shl $push3=, $3, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.shl $push4=, $2, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.shl $push5=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.shl $push6=, $16, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.shl $push9=, $15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.shl $push12=, $14, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.shl $push15=, $13, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.shl $push18=, $12, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.shl $push21=, $11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.shl $push24=, $10, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.shl $push27=, $8, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.shl $push30=, $7, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.shl $push33=, $6, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.shl $push36=, $4, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.and $push18=, $17, $pop0
+; NO-SIMD128-NEXT: local.tee $push17=, $17=, $pop18
+; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop17
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $15, $17
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $14, $17
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.shl $push4=, $13, $17
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.shl $push5=, $12, $17
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.shl $push6=, $11, $17
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.shl $push7=, $10, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.shl $push8=, $9, $17
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.shl $push9=, $8, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.shl $push10=, $7, $17
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.shl $push11=, $6, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.shl $push12=, $5, $17
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.shl $push13=, $4, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.shl $push14=, $3, $17
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.shl $push15=, $2, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v16i8:
; NO-SIMD128-FAST: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $17, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push39=, $17=, $pop40
-; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop39
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $17, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push17=, $17=, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $17
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $17
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.shl $push17=, $9, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.shl $push23=, $11, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $12, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.shl $push29=, $13, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $14, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shl $push35=, $15, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $16, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -2890,75 +2362,53 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
; NO-SIMD128: .functype shl_const_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop53
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $3, $pop52
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $2, $pop51
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, 5
-; NO-SIMD128-NEXT: i32.shl $push5=, $1, $pop50
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.shl $push6=, $16, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, 5
-; NO-SIMD128-NEXT: i32.shl $push9=, $15, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, 5
-; NO-SIMD128-NEXT: i32.shl $push12=, $14, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, 5
-; NO-SIMD128-NEXT: i32.shl $push15=, $13, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, 5
-; NO-SIMD128-NEXT: i32.shl $push18=, $12, $pop45
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, 5
-; NO-SIMD128-NEXT: i32.shl $push21=, $11, $pop44
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, 5
-; NO-SIMD128-NEXT: i32.shl $push24=, $10, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, 5
-; NO-SIMD128-NEXT: i32.shl $push27=, $8, $pop42
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, 5
-; NO-SIMD128-NEXT: i32.shl $push30=, $7, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push40=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop40
-; NO-SIMD128-NEXT: i32.const $push39=, 5
-; NO-SIMD128-NEXT: i32.shl $push33=, $6, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop33
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.const $push38=, 5
-; NO-SIMD128-NEXT: i32.shl $push35=, $4, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, 5
+; NO-SIMD128-NEXT: i32.shl $push2=, $15, $pop31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $14, $pop30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $13, $pop29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, 5
+; NO-SIMD128-NEXT: i32.shl $push5=, $12, $pop28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, 5
+; NO-SIMD128-NEXT: i32.shl $push6=, $11, $pop27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, 5
+; NO-SIMD128-NEXT: i32.shl $push7=, $10, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, 5
+; NO-SIMD128-NEXT: i32.shl $push8=, $9, $pop25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, 5
+; NO-SIMD128-NEXT: i32.shl $push9=, $8, $pop24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, 5
+; NO-SIMD128-NEXT: i32.shl $push10=, $7, $pop23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, 5
+; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, 5
+; NO-SIMD128-NEXT: i32.shl $push12=, $5, $pop21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, 5
+; NO-SIMD128-NEXT: i32.shl $push13=, $4, $pop20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, 5
+; NO-SIMD128-NEXT: i32.shl $push14=, $3, $pop19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, 5
+; NO-SIMD128-NEXT: i32.shl $push15=, $2, $pop18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, 5
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v16i8:
@@ -2967,73 +2417,51 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop31
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop30
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop50
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $6, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $7, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $8, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $9, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push19=, $10, $pop44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push25=, $12, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push31=, $14, $pop40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $15, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push37=, $16, $pop38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $pop24
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = shl <16 x i8> %v,
<i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5,
@@ -3248,91 +2676,69 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128: .functype shl_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0
-; NO-SIMD128-NEXT: i32.shl $push2=, $9, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, 255
-; NO-SIMD128-NEXT: i32.and $push3=, $21, $pop69
-; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop3
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $19, $pop68
-; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop5
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop67
-; NO-SIMD128-NEXT: i32.shl $push8=, $2, $pop7
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, 255
-; NO-SIMD128-NEXT: i32.and $push9=, $17, $pop66
-; NO-SIMD128-NEXT: i32.shl $push10=, $1, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $32, $pop65
-; NO-SIMD128-NEXT: i32.shl $push12=, $16, $pop11
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, 255
-; NO-SIMD128-NEXT: i32.and $push15=, $31, $pop64
-; NO-SIMD128-NEXT: i32.shl $push16=, $15, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $30, $pop63
-; NO-SIMD128-NEXT: i32.shl $push20=, $14, $pop19
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $29, $pop62
-; NO-SIMD128-NEXT: i32.shl $push24=, $13, $pop23
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $28, $pop61
-; NO-SIMD128-NEXT: i32.shl $push28=, $12, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop60
-; NO-SIMD128-NEXT: i32.shl $push32=, $11, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, 255
-; NO-SIMD128-NEXT: i32.and $push35=, $26, $pop59
-; NO-SIMD128-NEXT: i32.shl $push36=, $10, $pop35
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, 255
-; NO-SIMD128-NEXT: i32.and $push39=, $24, $pop58
-; NO-SIMD128-NEXT: i32.shl $push40=, $8, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $23, $pop57
-; NO-SIMD128-NEXT: i32.shl $push44=, $7, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $22, $pop56
-; NO-SIMD128-NEXT: i32.shl $push48=, $6, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $20, $pop55
-; NO-SIMD128-NEXT: i32.shl $push52=, $4, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0
+; NO-SIMD128-NEXT: i32.shl $push2=, $16, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, 255
+; NO-SIMD128-NEXT: i32.and $push3=, $31, $pop47
+; NO-SIMD128-NEXT: i32.shl $push4=, $15, $pop3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, 255
+; NO-SIMD128-NEXT: i32.and $push5=, $30, $pop46
+; NO-SIMD128-NEXT: i32.shl $push6=, $14, $pop5
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $29, $pop45
+; NO-SIMD128-NEXT: i32.shl $push8=, $13, $pop7
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, 255
+; NO-SIMD128-NEXT: i32.and $push9=, $28, $pop44
+; NO-SIMD128-NEXT: i32.shl $push10=, $12, $pop9
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, 255
+; NO-SIMD128-NEXT: i32.and $push11=, $27, $pop43
+; NO-SIMD128-NEXT: i32.shl $push12=, $11, $pop11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $26, $pop42
+; NO-SIMD128-NEXT: i32.shl $push14=, $10, $pop13
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, 255
+; NO-SIMD128-NEXT: i32.and $push15=, $25, $pop41
+; NO-SIMD128-NEXT: i32.shl $push16=, $9, $pop15
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $24, $pop40
+; NO-SIMD128-NEXT: i32.shl $push18=, $8, $pop17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $23, $pop39
+; NO-SIMD128-NEXT: i32.shl $push20=, $7, $pop19
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $22, $pop38
+; NO-SIMD128-NEXT: i32.shl $push22=, $6, $pop21
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $21, $pop37
+; NO-SIMD128-NEXT: i32.shl $push24=, $5, $pop23
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop36
+; NO-SIMD128-NEXT: i32.shl $push26=, $4, $pop25
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, 255
+; NO-SIMD128-NEXT: i32.and $push27=, $19, $pop35
+; NO-SIMD128-NEXT: i32.shl $push28=, $3, $pop27
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $18, $pop34
+; NO-SIMD128-NEXT: i32.shl $push30=, $2, $pop29
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop33
+; NO-SIMD128-NEXT: i32.shl $push32=, $1, $pop31
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v16i8:
@@ -3342,88 +2748,66 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop0
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop47
; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop46
; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $20, $pop67
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $21, $pop66
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $22, $pop65
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop64
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $24, $pop63
-; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop62
-; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $9, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $26, $pop61
-; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $10, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $27, $pop60
-; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $11, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $29, $pop58
-; NO-SIMD128-FAST-NEXT: i32.shl $push42=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $30, $pop57
-; NO-SIMD128-FAST-NEXT: i32.shl $push46=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push49=, $31, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shl $push50=, $15, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $32, $pop55
-; NO-SIMD128-FAST-NEXT: i32.shl $push54=, $16, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $20, $pop45
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $21, $pop44
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $22, $pop43
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $23, $pop42
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $24, $pop41
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $25, $pop40
+; NO-SIMD128-FAST-NEXT: i32.shl $push18=, $9, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $12, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $29, $pop36
+; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $13, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $30, $pop35
+; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $31, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $15, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $32, $pop33
+; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%a = shl <16 x i8> %v, %x
ret <16 x i8> %a
@@ -3445,79 +2829,57 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: shr_s_v16i8:
; NO-SIMD128: .functype shr_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $17, $pop0
-; NO-SIMD128-NEXT: local.tee $push55=, $17=, $pop56
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop55
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $5
+; NO-SIMD128-NEXT: i32.and $push34=, $17, $pop0
+; NO-SIMD128-NEXT: local.tee $push33=, $17=, $pop34
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop33
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $15
; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.extend8_s $push5=, $3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $14
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $13
; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $1
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $12
; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $16
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.extend8_s $push11=, $11
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.extend8_s $push15=, $15
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $10
+; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $9
; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $14
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $8
+; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $7
; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $13
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.extend8_s $push23=, $5
; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $12
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $4
+; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop25, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.extend8_s $push27=, $3
; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $2
+; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push31=, $1
; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop31, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.extend8_s $push35=, $10
-; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8
-; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7
-; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop43, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $6
-; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.extend8_s $push51=, $4
-; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop51, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v16i8:
@@ -3525,9 +2887,9 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push1=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $17, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push55=, $1=, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop55
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $17, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push33=, $1=, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop33
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push3=, $2
; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1
@@ -3535,67 +2897,45 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $3
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $4
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $5
; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $6
; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $8
; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $9
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $10
; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $12
; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $13
; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop25, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push27=, $14
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push28=, $pop27, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $15
; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $11
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop33, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $12
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop37, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $13
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $14
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop45, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $16
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push54=, $pop53, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $16
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -3811,108 +3151,86 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v16i8:
; NO-SIMD128: .functype shr_s_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push2=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push2=, $16
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0
; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop3
-; NO-SIMD128-NEXT: i32.extend8_s $push5=, $5
-; NO-SIMD128-NEXT: i32.const $push85=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop85
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.const $push63=, 255
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop63
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-NEXT: i32.extend8_s $push8=, $3
-; NO-SIMD128-NEXT: i32.const $push84=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop84
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $14
+; NO-SIMD128-NEXT: i32.const $push62=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop62
; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $2
-; NO-SIMD128-NEXT: i32.const $push83=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop83
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
+; NO-SIMD128-NEXT: i32.extend8_s $push11=, $13
+; NO-SIMD128-NEXT: i32.const $push61=, 255
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop61
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop12
-; NO-SIMD128-NEXT: i32.extend8_s $push14=, $1
-; NO-SIMD128-NEXT: i32.const $push82=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop82
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push14=, $12
+; NO-SIMD128-NEXT: i32.const $push60=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop60
; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 15
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $16
-; NO-SIMD128-NEXT: i32.const $push81=, 255
-; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop81
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $11
+; NO-SIMD128-NEXT: i32.const $push59=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop59
; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 14
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $15
-; NO-SIMD128-NEXT: i32.const $push80=, 255
-; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop80
-; NO-SIMD128-NEXT: i32.shr_s $push23=, $pop22, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 13
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $14
-; NO-SIMD128-NEXT: i32.const $push79=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop79
-; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 12
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push32=, $13
-; NO-SIMD128-NEXT: i32.const $push78=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop78
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $10
+; NO-SIMD128-NEXT: i32.const $push58=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop58
+; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.extend8_s $push23=, $9
+; NO-SIMD128-NEXT: i32.const $push57=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop57
+; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push26=, $8
+; NO-SIMD128-NEXT: i32.const $push56=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop56
+; NO-SIMD128-NEXT: i32.shr_s $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $7
+; NO-SIMD128-NEXT: i32.const $push55=, 255
+; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop55
+; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $6
+; NO-SIMD128-NEXT: i32.const $push54=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop54
; NO-SIMD128-NEXT: i32.shr_s $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push39=, 11
-; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $12
-; NO-SIMD128-NEXT: i32.const $push77=, 255
-; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop77
-; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-NEXT: i32.const $push44=, 10
-; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $11
-; NO-SIMD128-NEXT: i32.const $push76=, 255
-; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop76
-; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop42, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-NEXT: i32.const $push49=, 9
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $10
-; NO-SIMD128-NEXT: i32.const $push75=, 255
-; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop75
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.extend8_s $push35=, $5
+; NO-SIMD128-NEXT: i32.const $push53=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop53
+; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $pop34
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.extend8_s $push38=, $4
+; NO-SIMD128-NEXT: i32.const $push52=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop52
+; NO-SIMD128-NEXT: i32.shr_s $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $3
+; NO-SIMD128-NEXT: i32.const $push51=, 255
+; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop51
+; NO-SIMD128-NEXT: i32.shr_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $2
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop50
+; NO-SIMD128-NEXT: i32.shr_s $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.extend8_s $push47=, $1
+; NO-SIMD128-NEXT: i32.const $push49=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49
; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push54=, 7
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $8
-; NO-SIMD128-NEXT: i32.const $push74=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop74
-; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop52, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-NEXT: i32.const $push59=, 6
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.extend8_s $push57=, $7
-; NO-SIMD128-NEXT: i32.const $push73=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop73
-; NO-SIMD128-NEXT: i32.shr_s $push58=, $pop57, $pop56
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.const $push64=, 5
-; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-NEXT: i32.extend8_s $push62=, $6
-; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop72
-; NO-SIMD128-NEXT: i32.shr_s $push63=, $pop62, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-NEXT: i32.const $push69=, 3
-; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69
-; NO-SIMD128-NEXT: i32.extend8_s $push67=, $4
-; NO-SIMD128-NEXT: i32.const $push71=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71
-; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop67, $pop66
-; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v16i8:
@@ -3924,102 +3242,80 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop63
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push8=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop62
; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $20, $pop83
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop82
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $22, $pop81
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop80
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $4
+; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop61
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop60
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop59
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop58
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop57
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $9
+; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop56
; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $24, $pop79
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop29), $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $9
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop78
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop34, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push39=, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $26, $pop77
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop39, $pop38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $27, $pop76
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop55
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $11
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop54
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push33=, $pop32, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $12
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop53
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push38=, $13
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop52
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $14
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop51
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop50
; NO-SIMD128-FAST-NEXT: i32.shr_s $push45=, $pop44, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push48=, $28, $pop75
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push54=, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $29, $pop74
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop54, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push59=, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $30, $pop73
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push60=, $pop59, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop57), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push64=, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $31, $pop72
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $32, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop70
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push47=, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = ashr <16 x i8> %v, %x
ret <16 x i8> %a
@@ -4042,94 +3338,72 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128: .functype shr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push71=, $17, $pop72
-; NO-SIMD128-NEXT: local.tee $push70=, $17=, $pop71
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop70
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, 255
-; NO-SIMD128-NEXT: i32.and $push3=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $17, $pop50
+; NO-SIMD128-NEXT: local.tee $push48=, $17=, $pop49
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop48
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, 255
+; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop47
; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop68
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, 255
+; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop46
; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $2, $pop67
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop45
; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, 255
-; NO-SIMD128-NEXT: i32.and $push9=, $1, $pop66
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, 255
+; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop44
; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $16, $pop65
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, 255
+; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop43
; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, 255
-; NO-SIMD128-NEXT: i32.and $push15=, $15, $pop64
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $10, $pop42
+; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, 255
+; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop41
; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $14, $pop63
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $8, $pop40
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop39
; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $13, $pop62
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $6, $pop38
+; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $5, $pop37
; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $12, $pop61
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $4, $pop36
+; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, 255
+; NO-SIMD128-NEXT: i32.and $push27=, $3, $pop35
; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $11, $pop60
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $2, $pop34
+; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $1, $pop33
; NO-SIMD128-NEXT: i32.shr_u $push32=, $pop31, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, 255
-; NO-SIMD128-NEXT: i32.and $push35=, $10, $pop59
-; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, 255
-; NO-SIMD128-NEXT: i32.and $push39=, $8, $pop58
-; NO-SIMD128-NEXT: i32.shr_u $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $7, $pop57
-; NO-SIMD128-NEXT: i32.shr_u $push44=, $pop43, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $6, $pop56
-; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $4, $pop55
-; NO-SIMD128-NEXT: i32.shr_u $push52=, $pop51, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v16i8:
@@ -4137,93 +3411,71 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $17, $pop72
-; NO-SIMD128-FAST-NEXT: local.tee $push70=, $1=, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $17, $pop50
+; NO-SIMD128-FAST-NEXT: local.tee $push48=, $1=, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop48
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop47
; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop46
; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop45
; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop44
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop43
; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop65
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop42
; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop64
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop41
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $9, $pop40
; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $pop39
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $11, $pop38
; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $12, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $13, $pop36
; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push27=, $10, $pop61
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $14, $pop35
; NO-SIMD128-FAST-NEXT: i32.shr_u $push28=, $pop27, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $11, $pop60
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.shr_u $push32=, $pop31, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop59
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push39=, $13, $pop58
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push40=, $pop39, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $14, $pop57
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push44=, $pop43, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $15, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $16, $pop55
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push52=, $pop51, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -4440,123 +3692,101 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128: .functype shr_u_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop0
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop101
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $5, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop99
-; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $3, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop97
-; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $2, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop95
-; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $1, $pop94
-; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop93
-; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 15
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push17=, $16, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop91
-; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 14
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push22=, $15, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop89
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 13
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop88
-; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop87
-; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 12
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push86=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $13, $pop86
-; NO-SIMD128-NEXT: i32.const $push85=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop85
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push39=, 11
-; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-NEXT: i32.const $push84=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $12, $pop84
-; NO-SIMD128-NEXT: i32.const $push83=, 255
-; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop83
-; NO-SIMD128-NEXT: i32.shr_u $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-NEXT: i32.const $push44=, 10
-; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-NEXT: i32.const $push82=, 255
-; NO-SIMD128-NEXT: i32.and $push42=, $11, $pop82
-; NO-SIMD128-NEXT: i32.const $push81=, 255
-; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop81
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-NEXT: i32.const $push49=, 9
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push80=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $10, $pop80
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.const $push79=, 255
-; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop79
-; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push54=, 7
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop79
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
; NO-SIMD128-NEXT: i32.const $push78=, 255
-; NO-SIMD128-NEXT: i32.and $push52=, $8, $pop78
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop78
; NO-SIMD128-NEXT: i32.const $push77=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop77
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-NEXT: i32.const $push59=, 6
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop77
+; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
; NO-SIMD128-NEXT: i32.const $push76=, 255
-; NO-SIMD128-NEXT: i32.and $push57=, $7, $pop76
+; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop76
; NO-SIMD128-NEXT: i32.const $push75=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop75
-; NO-SIMD128-NEXT: i32.shr_u $push58=, $pop57, $pop56
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.const $push64=, 5
-; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop75
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
; NO-SIMD128-NEXT: i32.const $push74=, 255
-; NO-SIMD128-NEXT: i32.and $push62=, $6, $pop74
+; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop74
; NO-SIMD128-NEXT: i32.const $push73=, 255
-; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop73
-; NO-SIMD128-NEXT: i32.shr_u $push63=, $pop62, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-NEXT: i32.const $push69=, 3
-; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop73
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push67=, $4, $pop72
+; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop72
; NO-SIMD128-NEXT: i32.const $push71=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71
-; NO-SIMD128-NEXT: i32.shr_u $push68=, $pop67, $pop66
-; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop71
+; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop69
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop67
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop65
+; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push64=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $8, $pop64
+; NO-SIMD128-NEXT: i32.const $push63=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop63
+; NO-SIMD128-NEXT: i32.shr_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.const $push62=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $7, $pop62
+; NO-SIMD128-NEXT: i32.const $push61=, 255
+; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop61
+; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push60=, 255
+; NO-SIMD128-NEXT: i32.and $push32=, $6, $pop60
+; NO-SIMD128-NEXT: i32.const $push59=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop59
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.const $push58=, 255
+; NO-SIMD128-NEXT: i32.and $push35=, $5, $pop58
+; NO-SIMD128-NEXT: i32.const $push57=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop57
+; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop34
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push56=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $4, $pop56
+; NO-SIMD128-NEXT: i32.const $push55=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop55
+; NO-SIMD128-NEXT: i32.shr_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.const $push54=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $3, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 255
+; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop53
+; NO-SIMD128-NEXT: i32.shr_u $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.const $push52=, 255
+; NO-SIMD128-NEXT: i32.and $push44=, $2, $pop52
+; NO-SIMD128-NEXT: i32.const $push51=, 255
+; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop51
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push47=, $1, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49
+; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v16i8:
@@ -4564,122 +3794,100 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop79
; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop99
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop97
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop95
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop93
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop91
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop89
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop87
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop86
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop85
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $pop84
-; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $26, $pop83
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push38=, $pop37, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop82
-; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop81
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $12, $pop80
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $28, $pop79
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $13, $pop78
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop78
; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $29, $pop77
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $0, $pop59
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop77
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop76
; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $30, $pop75
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop75
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $15, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop74
; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push61=, $31, $pop73
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop61
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push70=, $0, $pop69
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop73
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push67=, $16, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop72
; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push66=, $32, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push68=, $pop67, $pop66
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop71
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop69
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop67
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop65
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop64
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop63
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $10, $pop62
+; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop61
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $11, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop59
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop58
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop57
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $13, $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop55
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $14, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop53
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $15, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop51
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push47=, $16, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = lshr <16 x i8> %v, %x
ret <16 x i8> %a
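; Editorial note: every hunk in this file shows the same codegen change.
; Lanes whose byte address was previously materialized with an explicit
; i32.const/i32.add pair now fold the constant into the store's offset
; immediate. A minimal sketch for one lane (lane 3 of the FAST lowering
; above); these lines are illustrative only, not FileCheck directives:
;
;   before:  i32.const  $push3=, 3
;            i32.add    $push4=, $0, $pop3
;            i32.and    $push5=, $4, $20
;            i32.store8 0($pop4), $pop5
;
;   after:   i32.and    $push3=, $4, $20
;            i32.store8 3($0), $pop3
;
; In the old output only offsets 0, 1, 2, 4, and 8 used the immediate
; form; after the change all sixteen lanes do, presumably because the
; backend's constant-offset folding into load/store immediates was
; generalized.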
@@ -4701,60 +3909,38 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: and_v16i8:
; NO-SIMD128: .functype and_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.and $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.and $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.and $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.and $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.and $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.and $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.and $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.and $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.and $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.and $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.and $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.and $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.and $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.and $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.and $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.and $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.and $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v16i8:
@@ -4766,54 +3952,32 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = and <16 x i8> %x, %y
ret <16 x i8> %a
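; The line counts in the hunk headers are easy to account for: each
; folded store drops one i32.const and one i32.add. In the NO-SIMD128
; and_v16i8 hunk above, 11 of the 16 lanes previously needed the
; explicit add, so the body shrinks by 11 x 2 = 22 lines, matching the
; header's 60 -> 38 (and 54 -> 32 for the FAST variant).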
@@ -4835,60 +3999,38 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: or_v16i8:
; NO-SIMD128: .functype or_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.or $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.or $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.or $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.or $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.or $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.or $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.or $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.or $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.or $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.or $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.or $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.or $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.or $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.or $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.or $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.or $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.or $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.or $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.or $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.or $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.or $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.or $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.or $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.or $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.or $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.or $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v16i8:
@@ -4900,54 +4042,32 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.or $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.or $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.or $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.or $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.or $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.or $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.or $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.or $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.or $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.or $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.or $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.or $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.or $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.or $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.or $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = or <16 x i8> %x, %y
ret <16 x i8> %a
@@ -4969,60 +4089,38 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: xor_v16i8:
; NO-SIMD128: .functype xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.xor $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.xor $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.xor $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.xor $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.xor $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.xor $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.xor $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.xor $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.xor $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.xor $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.xor $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.xor $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.xor $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.xor $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.xor $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.xor $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.xor $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.xor $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v16i8:
@@ -5034,54 +4132,32 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = xor <16 x i8> %x, %y
ret <16 x i8> %a
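; The not_v16i8 hunk that follows applies the same offset folding to a
; unary op: in the NO-SIMD128 lowering each lane's NOT is an i32.xor
; against the rematerialized all-ones constant, i.e. the scalarized form
; of the IR the test itself uses (visible below):
;
;   %not = xor <16 x i8> %x, <i8 -1, i8 -1, ...>   ; splat of -1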
@@ -5104,75 +4180,53 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
; NO-SIMD128: .functype not_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $5, $pop53
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop52
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $2, $pop51
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $1, $pop50
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, -1
-; NO-SIMD128-NEXT: i32.xor $push6=, $16, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $15, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $13, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop45
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop44
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push24=, $10, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push27=, $8, $pop42
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $7, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push40=, -1
-; NO-SIMD128-NEXT: i32.xor $push33=, $6, $pop40
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push39=, -1
-; NO-SIMD128-NEXT: i32.xor $push36=, $4, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, -1
+; NO-SIMD128-NEXT: i32.xor $push2=, $15, $pop31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $14, $pop30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $13, $pop29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $12, $pop28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $11, $pop27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $10, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, -1
+; NO-SIMD128-NEXT: i32.xor $push8=, $9, $pop25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $8, $pop24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $7, $pop23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, -1
+; NO-SIMD128-NEXT: i32.xor $push12=, $5, $pop21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $4, $pop20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $3, $pop19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $2, $pop18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v16i8:
@@ -5181,73 +4235,51 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop31
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop30
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop50
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $9, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $10, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $11, $pop44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $12, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $13, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $14, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $15, $pop40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $16, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $9, $pop24
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $10, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $11, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = xor <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -5274,91 +4306,69 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128: .functype andnot_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $25, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $21, $pop69
-; NO-SIMD128-NEXT: i32.and $push4=, $5, $pop3
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $19, $pop68
-; NO-SIMD128-NEXT: i32.and $push6=, $3, $pop5
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $18, $pop67
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop7
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $17, $pop66
-; NO-SIMD128-NEXT: i32.and $push10=, $1, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $32, $pop65
-; NO-SIMD128-NEXT: i32.and $push12=, $16, $pop11
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $31, $pop64
-; NO-SIMD128-NEXT: i32.and $push16=, $15, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $30, $pop63
-; NO-SIMD128-NEXT: i32.and $push20=, $14, $pop19
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, -1
-; NO-SIMD128-NEXT: i32.xor $push23=, $29, $pop62
-; NO-SIMD128-NEXT: i32.and $push24=, $13, $pop23
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, -1
-; NO-SIMD128-NEXT: i32.xor $push27=, $28, $pop61
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, -1
-; NO-SIMD128-NEXT: i32.xor $push31=, $27, $pop60
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, -1
-; NO-SIMD128-NEXT: i32.xor $push35=, $26, $pop59
-; NO-SIMD128-NEXT: i32.and $push36=, $10, $pop35
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, -1
-; NO-SIMD128-NEXT: i32.xor $push39=, $24, $pop58
-; NO-SIMD128-NEXT: i32.and $push40=, $8, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, -1
-; NO-SIMD128-NEXT: i32.xor $push43=, $23, $pop57
-; NO-SIMD128-NEXT: i32.and $push44=, $7, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, -1
-; NO-SIMD128-NEXT: i32.xor $push47=, $22, $pop56
-; NO-SIMD128-NEXT: i32.and $push48=, $6, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, -1
-; NO-SIMD128-NEXT: i32.xor $push51=, $20, $pop55
-; NO-SIMD128-NEXT: i32.and $push52=, $4, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.xor $push1=, $32, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $pop47
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $30, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $14, $pop5
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $29, $pop45
+; NO-SIMD128-NEXT: i32.and $push8=, $13, $pop7
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $28, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop9
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $27, $pop43
+; NO-SIMD128-NEXT: i32.and $push12=, $11, $pop11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $26, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $10, $pop13
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $25, $pop41
+; NO-SIMD128-NEXT: i32.and $push16=, $9, $pop15
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $24, $pop40
+; NO-SIMD128-NEXT: i32.and $push18=, $8, $pop17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push19=, $23, $pop39
+; NO-SIMD128-NEXT: i32.and $push20=, $7, $pop19
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $22, $pop38
+; NO-SIMD128-NEXT: i32.and $push22=, $6, $pop21
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push23=, $21, $pop37
+; NO-SIMD128-NEXT: i32.and $push24=, $5, $pop23
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $20, $pop36
+; NO-SIMD128-NEXT: i32.and $push26=, $4, $pop25
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push27=, $19, $pop35
+; NO-SIMD128-NEXT: i32.and $push28=, $3, $pop27
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $18, $pop34
+; NO-SIMD128-NEXT: i32.and $push30=, $2, $pop29
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push31=, $17, $pop33
+; NO-SIMD128-NEXT: i32.and $push32=, $1, $pop31
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v16i8:
@@ -5368,88 +4378,66 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $17, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop47
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop46
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $21, $pop66
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $pop65
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $23, $pop64
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $24, $pop63
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $25, $pop62
-; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $26, $pop61
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $10, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $27, $pop60
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $11, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $29, $pop58
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $30, $pop57
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $31, $pop56
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $15, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $32, $pop55
-; NO-SIMD128-FAST-NEXT: i32.and $push54=, $16, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $20, $pop45
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $21, $pop44
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $22, $pop43
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $23, $pop42
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $24, $pop41
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $25, $pop40
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $10, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $12, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $29, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $13, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $30, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $31, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $15, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $32, $pop33
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <16 x i8> %y,
<i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -5477,124 +4465,102 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: bitselect_v16i8:
; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48
; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push101=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $pop101
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $47
-; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push100=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $14, $pop100
-; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $46
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push79=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47
+; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push78=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46
+; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push77=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45
; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push99=, -1
-; NO-SIMD128-NEXT: i32.xor $push20=, $13, $pop99
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $45
-; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.and $push25=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push98=, -1
-; NO-SIMD128-NEXT: i32.xor $push26=, $12, $pop98
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $44
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push76=, -1
+; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44
+; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push75=, -1
+; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43
+; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push74=, -1
+; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74
+; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42
; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.and $push31=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push97=, -1
-; NO-SIMD128-NEXT: i32.xor $push32=, $11, $pop97
-; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $43
-; NO-SIMD128-NEXT: i32.or $push34=, $pop31, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.and $push37=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push96=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $10, $pop96
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push73=, -1
+; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41
+; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push33=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push72=, -1
+; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72
+; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40
+; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push37=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push71=, -1
+; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71
+; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39
; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.and $push43=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push95=, -1
-; NO-SIMD128-NEXT: i32.xor $push44=, $9, $pop95
-; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $41
-; NO-SIMD128-NEXT: i32.or $push46=, $pop43, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.and $push47=, $8, $24
-; NO-SIMD128-NEXT: i32.const $push94=, -1
-; NO-SIMD128-NEXT: i32.xor $push48=, $8, $pop94
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $40
-; NO-SIMD128-NEXT: i32.or $push50=, $pop47, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.and $push53=, $7, $23
-; NO-SIMD128-NEXT: i32.const $push93=, -1
-; NO-SIMD128-NEXT: i32.xor $push54=, $7, $pop93
-; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push41=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push70=, -1
+; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38
+; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.and $push45=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push69=, -1
+; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37
+; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.and $push49=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push68=, -1
+; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68
+; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36
+; NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.and $push53=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push67=, -1
+; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67
+; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35
; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.and $push59=, $6, $22
-; NO-SIMD128-NEXT: i32.const $push92=, -1
-; NO-SIMD128-NEXT: i32.xor $push60=, $6, $pop92
-; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $38
-; NO-SIMD128-NEXT: i32.or $push62=, $pop59, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.and $push65=, $5, $21
-; NO-SIMD128-NEXT: i32.const $push91=, -1
-; NO-SIMD128-NEXT: i32.xor $push66=, $5, $pop91
-; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $37
-; NO-SIMD128-NEXT: i32.or $push68=, $pop65, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-NEXT: i32.and $push69=, $4, $20
-; NO-SIMD128-NEXT: i32.const $push90=, -1
-; NO-SIMD128-NEXT: i32.xor $push70=, $4, $pop90
-; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $36
-; NO-SIMD128-NEXT: i32.or $push72=, $pop69, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-NEXT: i32.and $push75=, $3, $19
-; NO-SIMD128-NEXT: i32.const $push89=, -1
-; NO-SIMD128-NEXT: i32.xor $push76=, $3, $pop89
-; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $35
-; NO-SIMD128-NEXT: i32.or $push78=, $pop75, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
-; NO-SIMD128-NEXT: i32.and $push79=, $2, $18
-; NO-SIMD128-NEXT: i32.const $push88=, -1
-; NO-SIMD128-NEXT: i32.xor $push80=, $2, $pop88
-; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $34
-; NO-SIMD128-NEXT: i32.or $push82=, $pop79, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
-; NO-SIMD128-NEXT: i32.and $push83=, $1, $17
-; NO-SIMD128-NEXT: i32.const $push87=, -1
-; NO-SIMD128-NEXT: i32.xor $push84=, $1, $pop87
-; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $33
-; NO-SIMD128-NEXT: i32.or $push86=, $pop83, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.and $push57=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push66=, -1
+; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66
+; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34
+; NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.and $push61=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push65=, -1
+; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65
+; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33
+; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
@@ -5607,117 +4573,95 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop100
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop99
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop98
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $37
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop97
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $38
-; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop96
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37
+; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38
+; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39
+; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40
; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop95
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $40
-; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $9, $pop94
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41
+; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71
+; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42
+; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43
; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $10, $pop93
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69
+; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44
; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $11, $pop92
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $43
-; NO-SIMD128-FAST-NEXT: i32.or $push54=, $pop51, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $12, $pop91
-; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45
+; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67
+; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46
+; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66
+; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47
; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $13, $pop90
-; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $45
-; NO-SIMD128-FAST-NEXT: i32.or $push66=, $pop63, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $14, $pop89
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $46
-; NO-SIMD128-FAST-NEXT: i32.or $push72=, $pop69, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $15, $pop88
-; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $47
-; NO-SIMD128-FAST-NEXT: i32.or $push78=, $pop75, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $16, $pop87
-; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $48
-; NO-SIMD128-FAST-NEXT: i32.or $push84=, $pop81, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65
+; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48
+; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <16 x i8> %c, %v1
%inv_mask = xor <16 x i8> %c,
@@ -5746,92 +4690,70 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2
; NO-SIMD128-LABEL: bitselect_xor_v16i8:
; NO-SIMD128: .functype bitselect_xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 15
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48
-; NO-SIMD128-NEXT: i32.store8 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push5=, $31, $47
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $15
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $47
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $14
-; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 12
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.xor $push15=, $29, $45
-; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $13
-; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $45
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push23=, 11
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.xor $push20=, $28, $44
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $12
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $44
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push28=, 10
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.xor $push25=, $27, $43
-; NO-SIMD128-NEXT: i32.and $push26=, $pop25, $11
-; NO-SIMD128-NEXT: i32.xor $push27=, $pop26, $43
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push33=, 9
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.xor $push30=, $26, $42
-; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $10
-; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $42
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.xor $push35=, $25, $41
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $9
-; NO-SIMD128-NEXT: i32.xor $push37=, $pop36, $41
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop37
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.xor $push38=, $24, $40
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $8
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $40
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push46=, 6
-; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-NEXT: i32.xor $push43=, $23, $39
-; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $7
-; NO-SIMD128-NEXT: i32.xor $push45=, $pop44, $39
-; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.xor $push48=, $22, $38
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $6
-; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $38
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.xor $push53=, $21, $37
-; NO-SIMD128-NEXT: i32.and $push54=, $pop53, $5
-; NO-SIMD128-NEXT: i32.xor $push55=, $pop54, $37
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop55
-; NO-SIMD128-NEXT: i32.const $push59=, 3
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.xor $push56=, $20, $36
-; NO-SIMD128-NEXT: i32.and $push57=, $pop56, $4
-; NO-SIMD128-NEXT: i32.xor $push58=, $pop57, $36
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.xor $push61=, $19, $35
-; NO-SIMD128-NEXT: i32.and $push62=, $pop61, $3
-; NO-SIMD128-NEXT: i32.xor $push63=, $pop62, $35
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop63
-; NO-SIMD128-NEXT: i32.xor $push64=, $18, $34
-; NO-SIMD128-NEXT: i32.and $push65=, $pop64, $2
-; NO-SIMD128-NEXT: i32.xor $push66=, $pop65, $34
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop66
-; NO-SIMD128-NEXT: i32.xor $push67=, $17, $33
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $1
-; NO-SIMD128-NEXT: i32.xor $push69=, $pop68, $33
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $14
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop23
+; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40
+; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8
+; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop26
+; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7
+; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37
+; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5
+; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop35
+; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36
+; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4
+; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop38
+; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3
+; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop41
+; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33
+; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1
+; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop47
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v16i8:
@@ -5849,80 +4771,58 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $20, $36
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $21, $37
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $37
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $22, $38
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $23, $39
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $24, $40
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop31
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $25, $41
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $pop32, $9
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $pop33, $41
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $26, $42
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $pop37, $10
-; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $pop38, $42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $11
-; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop41), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $28, $44
-; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $12
-; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $pop48, $44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $29, $45
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $13
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop51), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $30, $46
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $pop57, $14
-; NO-SIMD128-FAST-NEXT: i32.xor $push59=, $pop58, $46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $31, $47
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $15
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop64
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $0, $pop65
-; NO-SIMD128-FAST-NEXT: i32.xor $push67=, $32, $48
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $16
-; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $pop68, $48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop66), $pop69
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $12
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <16 x i8> %v1, %v2
%and = and <16 x i8> %xor1, %c
@@ -5949,124 +4849,102 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
; NO-SIMD128-LABEL: bitselect_xor_reversed_v16i8:
; NO-SIMD128: .functype bitselect_xor_reversed_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $32, $48
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $48
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push8=, $31, $47
-; NO-SIMD128-NEXT: i32.const $push101=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop101
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $47
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.xor $push14=, $30, $46
-; NO-SIMD128-NEXT: i32.const $push100=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $14, $pop100
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $31, $47
+; NO-SIMD128-NEXT: i32.const $push79=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop79
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46
+; NO-SIMD128-NEXT: i32.const $push78=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $14, $pop78
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $29, $45
+; NO-SIMD128-NEXT: i32.const $push77=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $13, $pop77
; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $46
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.xor $push20=, $29, $45
-; NO-SIMD128-NEXT: i32.const $push99=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $13, $pop99
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $45
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.xor $push26=, $28, $44
-; NO-SIMD128-NEXT: i32.const $push98=, -1
-; NO-SIMD128-NEXT: i32.xor $push25=, $12, $pop98
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push18=, $28, $44
+; NO-SIMD128-NEXT: i32.const $push76=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $12, $pop76
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push22=, $27, $43
+; NO-SIMD128-NEXT: i32.const $push75=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop75
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.xor $push26=, $26, $42
+; NO-SIMD128-NEXT: i32.const $push74=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $10, $pop74
; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $44
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.xor $push32=, $27, $43
-; NO-SIMD128-NEXT: i32.const $push97=, -1
-; NO-SIMD128-NEXT: i32.xor $push31=, $11, $pop97
-; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.xor $push34=, $pop33, $43
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.xor $push38=, $26, $42
-; NO-SIMD128-NEXT: i32.const $push96=, -1
-; NO-SIMD128-NEXT: i32.xor $push37=, $10, $pop96
+; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.xor $push30=, $25, $41
+; NO-SIMD128-NEXT: i32.const $push73=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $9, $pop73
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push34=, $24, $40
+; NO-SIMD128-NEXT: i32.const $push72=, -1
+; NO-SIMD128-NEXT: i32.xor $push33=, $8, $pop72
+; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.xor $push38=, $23, $39
+; NO-SIMD128-NEXT: i32.const $push71=, -1
+; NO-SIMD128-NEXT: i32.xor $push37=, $7, $pop71
; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $42
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.xor $push44=, $25, $41
-; NO-SIMD128-NEXT: i32.const $push95=, -1
-; NO-SIMD128-NEXT: i32.xor $push43=, $9, $pop95
-; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.xor $push46=, $pop45, $41
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.xor $push48=, $24, $40
-; NO-SIMD128-NEXT: i32.const $push94=, -1
-; NO-SIMD128-NEXT: i32.xor $push47=, $8, $pop94
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $40
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.xor $push54=, $23, $39
-; NO-SIMD128-NEXT: i32.const $push93=, -1
-; NO-SIMD128-NEXT: i32.xor $push53=, $7, $pop93
+; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.xor $push42=, $22, $38
+; NO-SIMD128-NEXT: i32.const $push70=, -1
+; NO-SIMD128-NEXT: i32.xor $push41=, $6, $pop70
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push46=, $21, $37
+; NO-SIMD128-NEXT: i32.const $push69=, -1
+; NO-SIMD128-NEXT: i32.xor $push45=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.xor $push48=, $pop47, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.xor $push50=, $20, $36
+; NO-SIMD128-NEXT: i32.const $push68=, -1
+; NO-SIMD128-NEXT: i32.xor $push49=, $4, $pop68
+; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.xor $push52=, $pop51, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.xor $push54=, $19, $35
+; NO-SIMD128-NEXT: i32.const $push67=, -1
+; NO-SIMD128-NEXT: i32.xor $push53=, $3, $pop67
; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $39
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.xor $push60=, $22, $38
-; NO-SIMD128-NEXT: i32.const $push92=, -1
-; NO-SIMD128-NEXT: i32.xor $push59=, $6, $pop92
-; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.xor $push62=, $pop61, $38
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.xor $push66=, $21, $37
-; NO-SIMD128-NEXT: i32.const $push91=, -1
-; NO-SIMD128-NEXT: i32.xor $push65=, $5, $pop91
-; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.xor $push68=, $pop67, $37
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-NEXT: i32.xor $push70=, $20, $36
-; NO-SIMD128-NEXT: i32.const $push90=, -1
-; NO-SIMD128-NEXT: i32.xor $push69=, $4, $pop90
-; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.xor $push72=, $pop71, $36
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-NEXT: i32.xor $push76=, $19, $35
-; NO-SIMD128-NEXT: i32.const $push89=, -1
-; NO-SIMD128-NEXT: i32.xor $push75=, $3, $pop89
-; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.xor $push78=, $pop77, $35
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
-; NO-SIMD128-NEXT: i32.xor $push80=, $18, $34
-; NO-SIMD128-NEXT: i32.const $push88=, -1
-; NO-SIMD128-NEXT: i32.xor $push79=, $2, $pop88
-; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.xor $push82=, $pop81, $34
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
-; NO-SIMD128-NEXT: i32.xor $push84=, $17, $33
-; NO-SIMD128-NEXT: i32.const $push87=, -1
-; NO-SIMD128-NEXT: i32.xor $push83=, $1, $pop87
-; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.xor $push86=, $pop85, $33
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.xor $push58=, $18, $34
+; NO-SIMD128-NEXT: i32.const $push66=, -1
+; NO-SIMD128-NEXT: i32.xor $push57=, $2, $pop66
+; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.xor $push60=, $pop59, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.xor $push62=, $17, $33
+; NO-SIMD128-NEXT: i32.const $push65=, -1
+; NO-SIMD128-NEXT: i32.xor $push61=, $1, $pop65
+; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.xor $push64=, $pop63, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v16i8:
@@ -6079,117 +4957,95 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $33
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $18, $34
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop79
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $34
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $19, $35
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop100
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop78
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $35
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $20, $36
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop99
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop77
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $21, $37
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop98
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $37
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $22, $38
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop97
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $23, $39
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop75
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop73
; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $24, $40
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop95
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $25, $41
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $9, $pop94
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $pop35, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $10, $pop71
+; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.xor $push40=, $pop39, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop70
; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $41
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $26, $42
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $10, $pop93
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $12, $pop69
; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $27, $43
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $11, $pop92
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $28, $44
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $12, $pop91
+; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $13, $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $pop51, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $14, $pop67
+; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $pop55, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $15, $pop66
; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $29, $45
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push63=, $13, $pop90
-; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $pop65, $45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $30, $46
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $14, $pop89
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.xor $push72=, $pop71, $46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $31, $47
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push75=, $15, $pop88
-; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.xor $push78=, $pop77, $47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $32, $48
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push81=, $16, $pop87
-; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.xor $push84=, $pop83, $48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $16, $pop65
+; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <16 x i8> %v1, %v2
%notc = xor <16 x i8> %c, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -6218,30 +5074,22 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: add_v8i16:
; NO-SIMD128: .functype add_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.add $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.add $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.add $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.add $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.add $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v8i16:
@@ -6253,24 +5101,16 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = add <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6292,30 +5132,22 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: sub_v8i16:
; NO-SIMD128: .functype sub_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.sub $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.sub $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.sub $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.sub $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.sub $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.sub $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.sub $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.sub $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.sub $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.sub $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v8i16:
@@ -6327,24 +5159,16 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6366,30 +5190,22 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: mul_v8i16:
; NO-SIMD128: .functype mul_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.mul $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.mul $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.mul $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.mul $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.mul $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.mul $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.mul $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.mul $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.mul $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.mul $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v8i16:
@@ -6401,24 +5217,16 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = mul <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6440,54 +5248,46 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: min_s_v8i16:
; NO-SIMD128: .functype min_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 12
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15
-; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8
-; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 10
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14
+; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop11
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13
; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-NEXT: i32.const $push26=, 6
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12
-; NO-SIMD128-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12
+; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop19
+; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11
+; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop23
+; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10
+; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop27
+; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9
; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10
-; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32
-; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop35
-; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9
-; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v8i16:
@@ -6508,39 +5308,31 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12
; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16
; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6563,70 +5355,62 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: min_u_v8i16:
; NO-SIMD128: .functype min_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 65535
; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
-; NO-SIMD128-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47
; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54
-; NO-SIMD128-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53
-; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52
-; NO-SIMD128-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51
-; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49
-; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47
-; NO-SIMD128-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46
; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45
-; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45
+; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44
; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43
-; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43
+; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42
; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41
-; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41
+; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40
+; NO-SIMD128-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39
+; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37
+; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35
+; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34
+; NO-SIMD128-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33
+; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v8i16:
@@ -6634,68 +5418,60 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45
; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43
; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41
; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6718,54 +5494,46 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: max_s_v8i16:
; NO-SIMD128: .functype max_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 12
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15
-; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8
-; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 10
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14
+; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop11
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13
; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-NEXT: i32.const $push26=, 6
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12
-; NO-SIMD128-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12
+; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop19
+; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11
+; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop23
+; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10
+; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop27
+; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9
; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10
-; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32
-; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop35
-; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9
-; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v8i16:
@@ -6786,39 +5554,31 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12
; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16
; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6841,70 +5601,62 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: max_u_v8i16:
; NO-SIMD128: .functype max_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 65535
; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
-; NO-SIMD128-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54
-; NO-SIMD128-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53
-; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52
-; NO-SIMD128-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51
-; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49
-; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47
-; NO-SIMD128-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46
; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45
-; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45
+; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44
; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43
-; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43
+; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42
; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41
-; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41
+; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40
+; NO-SIMD128-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39
+; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37
+; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35
+; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34
+; NO-SIMD128-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33
+; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v8i16:
@@ -6912,68 +5664,60 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45
; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43
; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41
; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6996,78 +5740,70 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: avgr_u_v8i16:
; NO-SIMD128: .functype avgr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $8, $16
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 65534
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push63=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $7, $15
-; NO-SIMD128-NEXT: i32.const $push62=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62
-; NO-SIMD128-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61
-; NO-SIMD128-NEXT: i32.const $push60=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $6, $14
-; NO-SIMD128-NEXT: i32.const $push59=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59
-; NO-SIMD128-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58
-; NO-SIMD128-NEXT: i32.const $push57=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-NEXT: i32.const $push56=, 1
-; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56
-; NO-SIMD128-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 65534
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT: i32.const $push55=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $7, $15
; NO-SIMD128-NEXT: i32.const $push54=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-NEXT: i32.const $push24=, 6
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push26=, $4, $12
-; NO-SIMD128-NEXT: i32.const $push53=, 1
-; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53
-; NO-SIMD128-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-NEXT: i32.const $push52=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
; NO-SIMD128-NEXT: i32.const $push51=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51
-; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-NEXT: i32.add $push30=, $3, $11
-; NO-SIMD128-NEXT: i32.const $push50=, 1
-; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $5, $13
; NO-SIMD128-NEXT: i32.const $push48=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop33
-; NO-SIMD128-NEXT: i32.add $push34=, $2, $10
-; NO-SIMD128-NEXT: i32.const $push47=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47
-; NO-SIMD128-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-NEXT: i32.const $push46=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $4, $12
; NO-SIMD128-NEXT: i32.const $push45=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop37
-; NO-SIMD128-NEXT: i32.add $push38=, $1, $9
-; NO-SIMD128-NEXT: i32.const $push44=, 1
-; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-NEXT: i32.const $push43=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $3, $11
; NO-SIMD128-NEXT: i32.const $push42=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-NEXT: i32.const $push40=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $2, $10
+; NO-SIMD128-NEXT: i32.const $push39=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $1, $9
+; NO-SIMD128-NEXT: i32.const $push36=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-NEXT: i32.const $push34=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop33
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v8i16:
@@ -7078,73 +5814,65 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52
; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12
; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13
; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14
; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33
; NO-SIMD128-FAST-NEXT: return
%a = add nuw <8 x i16> %x, %y
%b = add nuw <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -7176,78 +5904,70 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: avgr_u_v8i16_wrap:
; NO-SIMD128: .functype avgr_u_v8i16_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $8, $16
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 65534
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push63=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $7, $15
-; NO-SIMD128-NEXT: i32.const $push62=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62
-; NO-SIMD128-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61
-; NO-SIMD128-NEXT: i32.const $push60=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $6, $14
-; NO-SIMD128-NEXT: i32.const $push59=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59
-; NO-SIMD128-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58
-; NO-SIMD128-NEXT: i32.const $push57=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-NEXT: i32.const $push56=, 1
-; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56
-; NO-SIMD128-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 65534
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT: i32.const $push55=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $7, $15
; NO-SIMD128-NEXT: i32.const $push54=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-NEXT: i32.const $push24=, 6
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push26=, $4, $12
-; NO-SIMD128-NEXT: i32.const $push53=, 1
-; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53
-; NO-SIMD128-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-NEXT: i32.const $push52=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
; NO-SIMD128-NEXT: i32.const $push51=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51
-; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-NEXT: i32.add $push30=, $3, $11
-; NO-SIMD128-NEXT: i32.const $push50=, 1
-; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $5, $13
; NO-SIMD128-NEXT: i32.const $push48=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop33
-; NO-SIMD128-NEXT: i32.add $push34=, $2, $10
-; NO-SIMD128-NEXT: i32.const $push47=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47
-; NO-SIMD128-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-NEXT: i32.const $push46=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $4, $12
; NO-SIMD128-NEXT: i32.const $push45=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop37
-; NO-SIMD128-NEXT: i32.add $push38=, $1, $9
-; NO-SIMD128-NEXT: i32.const $push44=, 1
-; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-NEXT: i32.const $push43=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $3, $11
; NO-SIMD128-NEXT: i32.const $push42=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-NEXT: i32.const $push40=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $2, $10
+; NO-SIMD128-NEXT: i32.const $push39=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $1, $9
+; NO-SIMD128-NEXT: i32.const $push36=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-NEXT: i32.const $push34=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop33
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v8i16_wrap:
@@ -7258,73 +5978,65 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52
; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12
; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13
; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14
; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33
; NO-SIMD128-FAST-NEXT: return
%a = add <8 x i16> %x, %y
%b = add <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -7348,70 +6060,62 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) {
; NO-SIMD128-LABEL: abs_v8i16:
; NO-SIMD128: .functype abs_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $8
; NO-SIMD128-NEXT: i32.const $push1=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push55=, $pop0, $pop1
-; NO-SIMD128-NEXT: local.tee $push54=, $9=, $pop55
-; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop54
+; NO-SIMD128-NEXT: i32.shr_s $push47=, $pop0, $pop1
+; NO-SIMD128-NEXT: local.tee $push46=, $9=, $pop47
+; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop46
; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $7
-; NO-SIMD128-NEXT: i32.const $push53=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop6, $pop53
-; NO-SIMD128-NEXT: local.tee $push51=, $8=, $pop52
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop51
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.extend16_s $push11=, $6
-; NO-SIMD128-NEXT: i32.const $push50=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push49=, $pop11, $pop50
-; NO-SIMD128-NEXT: local.tee $push48=, $8=, $pop49
-; NO-SIMD128-NEXT: i32.xor $push12=, $6, $pop48
-; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
-; NO-SIMD128-NEXT: i32.extend16_s $push16=, $5
-; NO-SIMD128-NEXT: i32.const $push47=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push46=, $pop16, $pop47
-; NO-SIMD128-NEXT: local.tee $push45=, $8=, $pop46
-; NO-SIMD128-NEXT: i32.xor $push17=, $5, $pop45
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7
+; NO-SIMD128-NEXT: i32.const $push45=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop4, $pop45
+; NO-SIMD128-NEXT: local.tee $push43=, $8=, $pop44
+; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop43
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6
+; NO-SIMD128-NEXT: i32.const $push42=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push41=, $pop7, $pop42
+; NO-SIMD128-NEXT: local.tee $push40=, $8=, $pop41
+; NO-SIMD128-NEXT: i32.xor $push8=, $6, $pop40
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $8
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5
+; NO-SIMD128-NEXT: i32.const $push39=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop10, $pop39
+; NO-SIMD128-NEXT: local.tee $push37=, $8=, $pop38
+; NO-SIMD128-NEXT: i32.xor $push11=, $5, $pop37
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $8
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $4
+; NO-SIMD128-NEXT: i32.const $push36=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push35=, $pop13, $pop36
+; NO-SIMD128-NEXT: local.tee $push34=, $8=, $pop35
+; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop34
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $8
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $3
+; NO-SIMD128-NEXT: i32.const $push33=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop16, $pop33
+; NO-SIMD128-NEXT: local.tee $push31=, $8=, $pop32
+; NO-SIMD128-NEXT: i32.xor $push17=, $3, $pop31
; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $8
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 6
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $4
-; NO-SIMD128-NEXT: i32.const $push44=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop19, $pop44
-; NO-SIMD128-NEXT: local.tee $push42=, $8=, $pop43
-; NO-SIMD128-NEXT: i32.xor $push20=, $4, $pop42
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.extend16_s $push19=, $2
+; NO-SIMD128-NEXT: i32.const $push30=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push29=, $pop19, $pop30
+; NO-SIMD128-NEXT: local.tee $push28=, $8=, $pop29
+; NO-SIMD128-NEXT: i32.xor $push20=, $2, $pop28
; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.extend16_s $push24=, $3
-; NO-SIMD128-NEXT: i32.const $push41=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop24, $pop41
-; NO-SIMD128-NEXT: local.tee $push39=, $8=, $pop40
-; NO-SIMD128-NEXT: i32.xor $push25=, $3, $pop39
-; NO-SIMD128-NEXT: i32.sub $push26=, $pop25, $8
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push27=, $2
-; NO-SIMD128-NEXT: i32.const $push38=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push37=, $pop27, $pop38
-; NO-SIMD128-NEXT: local.tee $push36=, $8=, $pop37
-; NO-SIMD128-NEXT: i32.xor $push28=, $2, $pop36
-; NO-SIMD128-NEXT: i32.sub $push29=, $pop28, $8
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
-; NO-SIMD128-NEXT: i32.extend16_s $push30=, $1
-; NO-SIMD128-NEXT: i32.const $push35=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push34=, $pop30, $pop35
-; NO-SIMD128-NEXT: local.tee $push33=, $8=, $pop34
-; NO-SIMD128-NEXT: i32.xor $push31=, $1, $pop33
-; NO-SIMD128-NEXT: i32.sub $push32=, $pop31, $8
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.extend16_s $push22=, $1
+; NO-SIMD128-NEXT: i32.const $push27=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop22, $pop27
+; NO-SIMD128-NEXT: local.tee $push25=, $8=, $pop26
+; NO-SIMD128-NEXT: i32.xor $push23=, $1, $pop25
+; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $8
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v8i16:
@@ -7419,68 +6123,60 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push0=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push1=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop0, $pop1
-; NO-SIMD128-FAST-NEXT: local.tee $push54=, $9=, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop54
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push47=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT: local.tee $push46=, $9=, $pop47
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop46
; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $9
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push4=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push52=, $pop4, $pop53
-; NO-SIMD128-FAST-NEXT: local.tee $push51=, $1=, $pop52
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push44=, $pop4, $pop45
+; NO-SIMD128-FAST-NEXT: local.tee $push43=, $1=, $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop43
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push49=, $pop7, $pop50
-; NO-SIMD128-FAST-NEXT: local.tee $push48=, $2=, $pop49
-; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push41=, $pop7, $pop42
+; NO-SIMD128-FAST-NEXT: local.tee $push40=, $2=, $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop40
; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop10, $pop47
-; NO-SIMD128-FAST-NEXT: local.tee $push45=, $3=, $pop46
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop10, $pop39
+; NO-SIMD128-FAST-NEXT: local.tee $push37=, $3=, $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push43=, $pop15, $pop44
-; NO-SIMD128-FAST-NEXT: local.tee $push42=, $4=, $pop43
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop42
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop18, $pop41
-; NO-SIMD128-FAST-NEXT: local.tee $push39=, $5=, $pop40
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop39
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push37=, $pop23, $pop38
-; NO-SIMD128-FAST-NEXT: local.tee $push36=, $6=, $pop37
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop36
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop28, $pop35
-; NO-SIMD128-FAST-NEXT: local.tee $push33=, $0=, $pop34
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33
-; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $0
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop13, $pop36
+; NO-SIMD128-FAST-NEXT: local.tee $push34=, $4=, $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop34
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop16, $pop33
+; NO-SIMD128-FAST-NEXT: local.tee $push31=, $5=, $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop31
+; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push29=, $pop19, $pop30
+; NO-SIMD128-FAST-NEXT: local.tee $push28=, $6=, $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop22, $pop27
+; NO-SIMD128-FAST-NEXT: local.tee $push25=, $7=, $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> zeroinitializer, %x
%b = icmp slt <8 x i16> %x, zeroinitializer
@@ -7505,37 +6201,29 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
; NO-SIMD128: .functype neg_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $5
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop23, $3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop22, $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop21, $1
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, 0
-; NO-SIMD128-NEXT: i32.sub $push5=, $pop20, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, 0
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop19, $7
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 0
-; NO-SIMD128-NEXT: i32.sub $push11=, $pop18, $6
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, 0
-; NO-SIMD128-NEXT: i32.sub $push14=, $pop17, $4
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $8
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, 0
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop15, $7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop14, $6
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop13, $5
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, 0
+; NO-SIMD128-NEXT: i32.sub $push5=, $pop12, $4
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, 0
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop11, $3
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, 0
+; NO-SIMD128-NEXT: i32.sub $push7=, $pop10, $2
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, 0
+; NO-SIMD128-NEXT: i32.sub $push8=, $pop9, $1
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v8i16:
@@ -7544,35 +6232,27 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop23, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop15, $2
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop22, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop14, $3
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop21, $4
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop20, $5
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop18, $7
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $8
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop13, $4
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop11, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop10, $7
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop9, $8
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
%x
@@ -7596,64 +6276,48 @@ define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0
-; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18
-; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop17
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.shl $push3=, $2, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.shl $push4=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.shl $push5=, $8, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.shl $push8=, $7, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.shl $push11=, $6, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.shl $push14=, $4, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop0
+; NO-SIMD128-NEXT: local.tee $push9=, $9=, $pop10
+; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop9
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $7, $9
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $6, $9
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.shl $push4=, $5, $9
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.shl $push5=, $4, $9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.shl $push6=, $3, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.shl $push7=, $2, $9
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.shl $push8=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v8i16:
; NO-SIMD128-FAST: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push17=, $9=, $pop18
-; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $9, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push9=, $9=, $pop10
+; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop9
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $9
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $9
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -7681,37 +6345,29 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
; NO-SIMD128: .functype shl_const_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop22
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, 5
-; NO-SIMD128-NEXT: i32.shl $push5=, $8, $pop20
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, 5
-; NO-SIMD128-NEXT: i32.shl $push8=, $7, $pop19
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, 5
-; NO-SIMD128-NEXT: i32.shl $push14=, $4, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, 5
+; NO-SIMD128-NEXT: i32.shl $push2=, $7, $pop15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $6, $pop14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, 5
+; NO-SIMD128-NEXT: i32.shl $push5=, $4, $pop12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, 5
+; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, 5
+; NO-SIMD128-NEXT: i32.shl $push7=, $2, $pop10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, 5
+; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v8i16:
@@ -7720,35 +6376,27 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop14
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop17
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = shl <8 x i16> %v,
<i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
@@ -7866,45 +6514,37 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128: .functype shl_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-NEXT: i32.and $push3=, $11, $pop31
-; NO-SIMD128-NEXT: i32.shl $push4=, $3, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $10, $pop30
-; NO-SIMD128-NEXT: i32.shl $push6=, $2, $pop5
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop29
-; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop7
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-NEXT: i32.and $push9=, $16, $pop28
-; NO-SIMD128-NEXT: i32.shl $push10=, $8, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $15, $pop27
-; NO-SIMD128-NEXT: i32.shl $push14=, $7, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-NEXT: i32.and $push17=, $14, $pop26
-; NO-SIMD128-NEXT: i32.shl $push18=, $6, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-NEXT: i32.and $push21=, $12, $pop25
-; NO-SIMD128-NEXT: i32.shl $push22=, $4, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.shl $push2=, $8, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop23
+; NO-SIMD128-NEXT: i32.shl $push4=, $7, $pop3
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop22
+; NO-SIMD128-NEXT: i32.shl $push6=, $6, $pop5
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop21
+; NO-SIMD128-NEXT: i32.shl $push8=, $5, $pop7
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop20
+; NO-SIMD128-NEXT: i32.shl $push10=, $4, $pop9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop19
+; NO-SIMD128-NEXT: i32.shl $push12=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $10, $pop18
+; NO-SIMD128-NEXT: i32.shl $push14=, $2, $pop13
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop17
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop15
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v8i16:
@@ -7914,42 +6554,34 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop0
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop23
; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop22
; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $12, $pop29
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $13, $pop28
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $14, $pop27
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop25
-; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = shl <8 x i16> %v, %x
ret <8 x i16> %a
@@ -7971,41 +6603,33 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: shr_s_v8i16:
; NO-SIMD128: .functype shr_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push26=, $9, $pop0
-; NO-SIMD128-NEXT: local.tee $push25=, $9=, $pop26
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop25
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $3
+; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0
+; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop17
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $7
; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.extend16_s $push5=, $2
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $6
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5
; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $8
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $4
; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $7
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.extend16_s $push11=, $3
+; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $2
; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend16_s $push17=, $6
-; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.extend16_s $push21=, $4
-; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.extend16_s $push15=, $1
+; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v8i16:
@@ -8013,9 +6637,9 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push1=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push25=, $1=, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop25
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop17
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push3=, $2
; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1
@@ -8023,29 +6647,21 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $3
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $4
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $5
; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $6
; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $8
; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -8164,54 +6780,46 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v8i16:
; NO-SIMD128: .functype shr_s_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push2=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push2=, $8
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.extend16_s $push5=, $3
-; NO-SIMD128-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop39
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop31
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.extend16_s $push8=, $2
-; NO-SIMD128-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop38
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $6
+; NO-SIMD128-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop30
; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.extend16_s $push11=, $1
-; NO-SIMD128-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop37
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
+; NO-SIMD128-NEXT: i32.extend16_s $push11=, $5
+; NO-SIMD128-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop29
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push14=, $8
-; NO-SIMD128-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop36
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push14=, $4
+; NO-SIMD128-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop28
; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $7
-; NO-SIMD128-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop35
-; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push24=, $6
-; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop34
-; NO-SIMD128-NEXT: i32.shr_s $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $4
-; NO-SIMD128-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33
-; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $3
+; NO-SIMD128-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop27
+; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $2
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop26
+; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.extend16_s $push23=, $1
+; NO-SIMD128-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25
+; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v8i16:
@@ -8223,48 +6831,40 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop31
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push8=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop30
; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop14
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop36
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $14, $pop35
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop19), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push26=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop34
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push31=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $16, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop29), $pop32
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $4
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop29
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push14=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop28
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = ashr <8 x i16> %v, %x
ret <8 x i16> %a
@@ -8287,48 +6887,40 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128: .functype shr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $9, $pop34
-; NO-SIMD128-NEXT: local.tee $push32=, $9=, $pop33
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop32
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-NEXT: i32.and $push3=, $3, $pop31
+; NO-SIMD128-NEXT: i32.and $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $9, $pop26
+; NO-SIMD128-NEXT: local.tee $push24=, $9=, $pop25
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop24
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-NEXT: i32.and $push3=, $7, $pop23
; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop30
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop22
; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $1, $pop29
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $5, $pop21
; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-NEXT: i32.and $push9=, $8, $pop28
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-NEXT: i32.and $push9=, $4, $pop20
; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $7, $pop27
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $3, $pop19
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $2, $pop18
; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-NEXT: i32.and $push17=, $6, $pop26
-; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-NEXT: i32.and $push21=, $4, $pop25
-; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-NEXT: i32.and $push15=, $1, $pop17
+; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v8i16:
@@ -8336,47 +6928,39 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $pop34
-; NO-SIMD128-FAST-NEXT: local.tee $push32=, $1=, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop26
+; NO-SIMD128-FAST-NEXT: local.tee $push24=, $1=, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop24
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop23
; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop22
; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop21
; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop19
; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push15=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop18
; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop25
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop17
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -8496,61 +7080,53 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128: .functype shr_u_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop47
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop45
-; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop43
-; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop41
-; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop39
-; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop39
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop37
-; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop37
+; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop35
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop35
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33
-; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop33
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 65535
+; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop31
+; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop29
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop27
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25
+; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v8i16:
@@ -8558,60 +7134,52 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop39
; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop45
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop43
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop41
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop39
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $14, $pop37
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $15, $pop35
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop35
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $16, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop33
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = lshr <8 x i16> %v, %x
ret <8 x i16> %a
@@ -8633,30 +7201,22 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: and_v8i16:
; NO-SIMD128: .functype and_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.and $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.and $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.and $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.and $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.and $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.and $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.and $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v8i16:
@@ -8668,24 +7228,16 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = and <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8707,30 +7259,22 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: or_v8i16:
; NO-SIMD128: .functype or_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.or $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.or $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.or $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.or $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.or $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.or $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.or $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.or $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.or $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.or $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v8i16:
@@ -8742,24 +7286,16 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = or <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8781,30 +7317,22 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: xor_v8i16:
; NO-SIMD128: .functype xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.xor $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.xor $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.xor $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.xor $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.xor $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.xor $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v8i16:
@@ -8816,24 +7344,16 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = xor <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8856,37 +7376,29 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
; NO-SIMD128: .functype not_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop22
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $8, $pop20
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop19
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, -1
+; NO-SIMD128-NEXT: i32.xor $push2=, $7, $pop15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $5, $pop13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $4, $pop12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, -1
+; NO-SIMD128-NEXT: i32.xor $push8=, $1, $pop9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v8i16:
@@ -8895,35 +7407,27 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop14
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop17
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = xor <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1>
@@ -8948,45 +7452,37 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128: .functype andnot_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $13, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $11, $pop31
-; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $10, $pop30
-; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $9, $pop29
-; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $16, $pop28
-; NO-SIMD128-NEXT: i32.and $push10=, $8, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $15, $pop27
-; NO-SIMD128-NEXT: i32.and $push14=, $7, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, -1
-; NO-SIMD128-NEXT: i32.xor $push17=, $14, $pop26
-; NO-SIMD128-NEXT: i32.and $push18=, $6, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, -1
-; NO-SIMD128-NEXT: i32.xor $push21=, $12, $pop25
-; NO-SIMD128-NEXT: i32.and $push22=, $4, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $pop23
+; NO-SIMD128-NEXT: i32.and $push4=, $7, $pop3
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $14, $pop22
+; NO-SIMD128-NEXT: i32.and $push6=, $6, $pop5
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $13, $pop21
+; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop7
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $12, $pop20
+; NO-SIMD128-NEXT: i32.and $push10=, $4, $pop9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $11, $pop19
+; NO-SIMD128-NEXT: i32.and $push12=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $10, $pop18
+; NO-SIMD128-NEXT: i32.and $push14=, $2, $pop13
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $9, $pop17
+; NO-SIMD128-NEXT: i32.and $push16=, $1, $pop15
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v8i16:
@@ -8996,42 +7492,34 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $9, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop23
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop22
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $pop29
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $13, $pop28
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $15, $pop26
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $16, $pop25
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <8 x i16> %y,
<i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -9058,62 +7546,54 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: bitselect_v8i16:
; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.and $push0=, $16, $8
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2
; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $7
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop47
-; NO-SIMD128-NEXT: i32.and $push9=, $23, $pop8
-; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $6
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $6, $pop46
-; NO-SIMD128-NEXT: i32.and $push15=, $22, $pop14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $7
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39
+; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6
+; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $6
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38
+; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10
+; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $5
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37
+; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14
; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $5
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push20=, $5, $pop45
-; NO-SIMD128-NEXT: i32.and $push21=, $21, $pop20
-; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $4
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push24=, $4, $pop44
-; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop24
-; NO-SIMD128-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $3
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $3, $pop43
-; NO-SIMD128-NEXT: i32.and $push31=, $19, $pop30
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $4
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36
+; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18
+; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $3
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35
+; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22
+; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $2
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34
+; NO-SIMD128-NEXT: i32.and $push27=, $18, $pop26
+; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $1
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33
+; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30
; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $2
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push34=, $2, $pop42
-; NO-SIMD128-NEXT: i32.and $push35=, $18, $pop34
-; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $1
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $1, $pop41
-; NO-SIMD128-NEXT: i32.and $push39=, $17, $pop38
-; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
@@ -9126,55 +7606,47 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop43
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop42
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $23, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30
; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop41
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <8 x i16> %v1, %c
%inv_mask = xor <8 x i16>
@@ -9203,46 +7675,38 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2
; NO-SIMD128-LABEL: bitselect_xor_v8i16:
; NO-SIMD128: .functype bitselect_xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 14
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24
-; NO-SIMD128-NEXT: i32.store16 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push5=, $15, $23
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $7
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $23
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push13=, 10
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $6
-; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22
-; NO-SIMD128-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.xor $push15=, $13, $21
-; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $5
-; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 6
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20
-; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $4
-; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.xor $push23=, $11, $19
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $3
-; NO-SIMD128-NEXT: i32.xor $push25=, $pop24, $19
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
-; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $2
-; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
-; NO-SIMD128-NEXT: i32.xor $push29=, $9, $17
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $1
-; NO-SIMD128-NEXT: i32.xor $push31=, $pop30, $17
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v8i16:
@@ -9260,34 +7724,26 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $20
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $13, $21
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $14, $22
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $22
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $15, $23
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $16, $24
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <8 x i16> %v1, %v2
%and = and <8 x i16> %xor1, %c
@@ -9314,62 +7770,54 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
; NO-SIMD128-LABEL: bitselect_xor_reversed_v8i16:
; NO-SIMD128: .functype bitselect_xor_reversed_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $16, $24
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $24
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $23
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop47
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $23
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.xor $push14=, $14, $22
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $6, $pop46
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $15, $23
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop39
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $6, $pop38
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $13, $21
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $5, $pop37
; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $22
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.xor $push20=, $13, $21
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $5, $pop45
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.xor $push24=, $12, $20
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push23=, $4, $pop44
-; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $20
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.xor $push30=, $11, $19
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push29=, $3, $pop43
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $4, $pop36
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push22=, $11, $19
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $3, $pop35
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $2, $pop34
+; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.xor $push30=, $9, $17
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $1, $pop33
; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $19
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
-; NO-SIMD128-NEXT: i32.xor $push34=, $10, $18
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push33=, $2, $pop42
-; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $18
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
-; NO-SIMD128-NEXT: i32.xor $push38=, $9, $17
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push37=, $1, $pop41
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v8i16:
@@ -9382,55 +7830,47 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $17
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $10, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop39
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $18
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop38
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $19
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $12, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $13, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $14, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop43
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $22
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $15, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33
; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $16, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop41
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <8 x i16> %v1, %v2
%notc = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1,
@@ -9458,46 +7898,38 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: extmul_low_s_v8i16:
; NO-SIMD128: .functype extmul_low_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push0=, $21
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push0=, $24
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push4=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $19
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $23
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $18
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push6=, $22
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push10=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $17
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $21
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 14
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $24
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $23
-; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19
-; NO-SIMD128-NEXT: i32.const $push25=, 10
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.extend8_s $push28=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $20
-; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27
-; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $19
+; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push18=, $18
+; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $17
+; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_s_v8i16:
@@ -9515,34 +7947,26 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $19
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $4
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $20
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $21
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $22
-; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
-; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $24
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $20
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $21
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $22
+; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $23
+; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $24
+; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9572,46 +7996,38 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: extmul_high_s_v8i16:
; NO-SIMD128: .functype extmul_high_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push0=, $29
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
+; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push4=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $27
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $31
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $26
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push6=, $30
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push10=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $25
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $29
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 14
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $16
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $32
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $28
; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $31
-; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19
-; NO-SIMD128-NEXT: i32.const $push25=, 10
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $30
-; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.extend8_s $push28=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $28
-; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27
-; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $27
+; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push18=, $26
+; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $25
+; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_s_v8i16:
@@ -9629,34 +8045,26 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $27
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $28
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $29
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $30
-; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $31
-; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $32
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $28
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $29
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $30
+; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $31
+; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $32
+; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -9687,61 +8095,53 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128: .functype extmul_low_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $21, $pop47
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $19, $pop45
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop43
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $17, $pop41
-; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $24, $pop39
-; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $24, $pop39
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 255
-; NO-SIMD128-NEXT: i32.and $push18=, $23, $pop37
-; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $23, $pop37
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 255
-; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop35
-; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $22, $pop35
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 255
-; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 255
-; NO-SIMD128-NEXT: i32.and $push28=, $20, $pop33
-; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $21, $pop33
+; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 255
+; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $20, $pop31
+; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $19, $pop29
+; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $18, $pop27
+; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $17, $pop25
+; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_u_v8i16:
@@ -9749,60 +8149,52 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop39
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop45
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop43
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop41
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop39
-; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop37
-; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop35
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop33
-; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop33
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop31
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop29
+; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop25
+; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9833,61 +8225,53 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128: .functype extmul_high_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $13, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $29, $pop47
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $11, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $27, $pop45
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $10, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $26, $pop43
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $9, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $25, $pop41
-; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $16, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $32, $pop39
-; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop39
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $15, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 255
-; NO-SIMD128-NEXT: i32.and $push18=, $31, $pop37
-; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop37
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 255
-; NO-SIMD128-NEXT: i32.and $push24=, $14, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $30, $pop35
-; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop35
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 255
-; NO-SIMD128-NEXT: i32.and $push29=, $12, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 255
-; NO-SIMD128-NEXT: i32.and $push28=, $28, $pop33
-; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop33
+; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 255
+; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop31
+; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop29
+; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop27
+; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop25
+; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_u_v8i16:
@@ -9895,60 +8279,52 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop39
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop45
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop43
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop41
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $13, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $29, $pop39
-; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $14, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $30, $pop37
-; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $15, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $31, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop35
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $32, $pop33
-; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop33
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $13, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $29, $pop31
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $14, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $30, $pop29
+; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $15, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $31, $pop27
+; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $32, $pop25
+; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -9979,16 +8355,14 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: add_v4i32:
; NO-SIMD128: .functype add_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.add $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.add $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v4i32:
@@ -10000,10 +8374,8 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = add <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10025,16 +8397,14 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: sub_v4i32:
; NO-SIMD128: .functype sub_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.sub $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.sub $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v4i32:
@@ -10046,10 +8416,8 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10071,16 +8439,14 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: mul_v4i32:
; NO-SIMD128: .functype mul_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.mul $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.mul $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v4i32:
@@ -10092,10 +8458,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = mul <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10117,20 +8481,18 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: min_s_v4i32:
; NO-SIMD128: .functype min_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.lt_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.lt_s $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.lt_s $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.lt_s $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.lt_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.lt_s $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.lt_s $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v4i32:
@@ -10145,11 +8507,9 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.lt_s $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10172,20 +8532,18 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: min_u_v4i32:
; NO-SIMD128: .functype min_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.lt_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.lt_u $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.lt_u $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.lt_u $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.lt_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.lt_u $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.lt_u $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.lt_u $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v4i32:
@@ -10200,11 +8558,9 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_u $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.lt_u $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10227,20 +8583,18 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: max_s_v4i32:
; NO-SIMD128: .functype max_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.gt_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.gt_s $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.gt_s $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.gt_s $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.gt_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.gt_s $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.gt_s $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v4i32:
@@ -10255,11 +8609,9 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.gt_s $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10282,20 +8634,18 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: max_u_v4i32:
; NO-SIMD128: .functype max_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.gt_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.gt_u $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.gt_u $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.gt_u $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.gt_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.gt_u $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.gt_u $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.gt_u $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v4i32:
@@ -10310,11 +8660,9 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_u $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.gt_u $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10337,63 +8685,59 @@ define <4 x i32> @abs_v4i32(<4 x i32> %x) {
; NO-SIMD128-LABEL: abs_v4i32:
; NO-SIMD128: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.const $push0=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push21=, $4, $pop0
-; NO-SIMD128-NEXT: local.tee $push20=, $5=, $pop21
-; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop20
+; NO-SIMD128-NEXT: i32.shr_s $push19=, $4, $pop0
+; NO-SIMD128-NEXT: local.tee $push18=, $5=, $pop19
+; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop18
; NO-SIMD128-NEXT: i32.sub $push2=, $pop1, $5
-; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push19=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push18=, $3, $pop19
-; NO-SIMD128-NEXT: local.tee $push17=, $4=, $pop18
-; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push17=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push16=, $3, $pop17
+; NO-SIMD128-NEXT: local.tee $push15=, $4=, $pop16
+; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop15
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop3, $4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push14=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push13=, $2, $pop14
+; NO-SIMD128-NEXT: local.tee $push12=, $4=, $pop13
+; NO-SIMD128-NEXT: i32.xor $push5=, $2, $pop12
; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $4
-; NO-SIMD128-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push16=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push15=, $2, $pop16
-; NO-SIMD128-NEXT: local.tee $push14=, $4=, $pop15
-; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop14
+; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push11=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push10=, $1, $pop11
+; NO-SIMD128-NEXT: local.tee $push9=, $4=, $pop10
+; NO-SIMD128-NEXT: i32.xor $push7=, $1, $pop9
; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push13=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push12=, $1, $pop13
-; NO-SIMD128-NEXT: local.tee $push11=, $4=, $pop12
-; NO-SIMD128-NEXT: i32.xor $push9=, $1, $pop11
-; NO-SIMD128-NEXT: i32.sub $push10=, $pop9, $4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop10
+; NO-SIMD128-NEXT: i32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v4i32:
; NO-SIMD128-FAST: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push20=, $5=, $pop21
-; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push19=, $1, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push18=, $5=, $pop19
+; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop18
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop1, $5
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $2, $pop19
-; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: local.tee $push15=, $1=, $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $3, $pop16
-; NO-SIMD128-FAST-NEXT: local.tee $push14=, $2=, $pop15
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push13=, $3, $pop14
+; NO-SIMD128-FAST-NEXT: local.tee $push12=, $2=, $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop12
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $2
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $4, $pop13
-; NO-SIMD128-FAST-NEXT: local.tee $push11=, $0=, $pop12
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $0
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $4, $pop11
+; NO-SIMD128-FAST-NEXT: local.tee $push9=, $3=, $pop10
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $3
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> zeroinitializer, %x
%b = icmp slt <4 x i32> %x, zeroinitializer
@@ -10418,19 +8762,17 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
; NO-SIMD128: .functype neg_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $3
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop9, $2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop8, $1
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $4
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop7, $4
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop7, $3
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop6, $2
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop5, $1
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v4i32:
@@ -10439,17 +8781,15 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop9, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop7, $2
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop8, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop7, $4
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop5, $4
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %x
ret <4 x i32> %a
@@ -10471,16 +8811,14 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shl_v4i32:
; NO-SIMD128: .functype shl_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shl $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shl $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shl $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shl $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shl $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v4i32:
@@ -10492,10 +8830,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10523,19 +8859,17 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
; NO-SIMD128: .functype shl_const_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $3, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $2, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $1, $pop8
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.shl $push1=, $4, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v4i32:
@@ -10544,17 +8878,15 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop7
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %a
@@ -10606,16 +8938,14 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shl_vec_v4i32:
; NO-SIMD128: .functype shl_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shl $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shl $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shl $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shl $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shl $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v4i32:
@@ -10627,10 +8957,8 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = shl <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10652,16 +8980,14 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shr_s_v4i32:
; NO-SIMD128: .functype shr_s_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v4i32:
@@ -10673,10 +8999,8 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10731,16 +9055,14 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v4i32:
; NO-SIMD128: .functype shr_s_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v4i32:
@@ -10752,10 +9074,8 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = ashr <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10777,16 +9097,14 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shr_u_v4i32:
; NO-SIMD128: .functype shr_u_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v4i32:
@@ -10798,10 +9116,8 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10856,16 +9172,14 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shr_u_vec_v4i32:
; NO-SIMD128: .functype shr_u_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v4i32:
@@ -10877,10 +9191,8 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = lshr <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10902,16 +9214,14 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: and_v4i32:
; NO-SIMD128: .functype and_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.and $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v4i32:
@@ -10923,10 +9233,8 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = and <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10948,16 +9256,14 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: or_v4i32:
; NO-SIMD128: .functype or_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.or $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.or $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v4i32:
@@ -10969,10 +9275,8 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = or <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10994,16 +9298,14 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: xor_v4i32:
; NO-SIMD128: .functype xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.xor $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.xor $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v4i32:
@@ -11015,10 +9317,8 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = xor <4 x i32> %x, %y
ret <4 x i32> %a
@@ -11041,19 +9341,17 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
; NO-SIMD128: .functype not_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $3, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $2, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $1, $pop8
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v4i32:
@@ -11062,17 +9360,15 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop7
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %a
@@ -11096,23 +9392,21 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128: .functype andnot_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $7, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push13=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop13
-; NO-SIMD128-NEXT: i32.and $push4=, $2, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push12=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $5, $pop12
-; NO-SIMD128-NEXT: i32.and $push6=, $1, $pop5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
; NO-SIMD128-NEXT: i32.const $push11=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $8, $pop11
-; NO-SIMD128-NEXT: i32.and $push8=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $pop11
+; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3
+; NO-SIMD128-NEXT: i32.store 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push10=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $6, $pop10
+; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push9=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $5, $pop9
+; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7
+; NO-SIMD128-NEXT: i32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v4i32:
@@ -11122,20 +9416,18 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $5, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop11
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop10
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $pop11
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
%a = and <4 x i32> %x, %inv_y
@@ -11161,32 +9453,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-LABEL: bitselect_v4i32:
; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12
; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $3, $pop21
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $11
-; NO-SIMD128-NEXT: i32.and $push7=, $3, $7
-; NO-SIMD128-NEXT: i32.or $push10=, $pop9, $pop7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push12=, $2, $pop20
-; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $10
-; NO-SIMD128-NEXT: i32.and $push11=, $2, $6
-; NO-SIMD128-NEXT: i32.or $push14=, $pop13, $pop11
-; NO-SIMD128-NEXT: i32.store 4($0), $pop14
+; NO-SIMD128-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop19
-; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $9
-; NO-SIMD128-NEXT: i32.and $push15=, $1, $5
-; NO-SIMD128-NEXT: i32.or $push18=, $pop17, $pop15
-; NO-SIMD128-NEXT: i32.store 0($0), $pop18
+; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $7
+; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10
+; NO-SIMD128-NEXT: i32.and $push9=, $2, $6
+; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-NEXT: i32.store 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $1, $pop17
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $9
+; NO-SIMD128-NEXT: i32.and $push13=, $1, $5
+; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-NEXT: i32.store 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
@@ -11198,26 +9488,24 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop19
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <4 x i32> %c, %v1
%inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -11244,24 +9532,22 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2
; NO-SIMD128-LABEL: bitselect_xor_v4i32:
; NO-SIMD128: .functype bitselect_xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12
-; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.xor $push5=, $7, $11
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $3
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $11
-; NO-SIMD128-NEXT: i32.store 8($0), $pop7
-; NO-SIMD128-NEXT: i32.xor $push8=, $6, $10
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $2
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $10
-; NO-SIMD128-NEXT: i32.store 4($0), $pop10
-; NO-SIMD128-NEXT: i32.xor $push11=, $5, $9
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $1
-; NO-SIMD128-NEXT: i32.xor $push13=, $pop12, $9
-; NO-SIMD128-NEXT: i32.store 0($0), $pop13
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v4i32:
@@ -11279,12 +9565,10 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $11
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $8, $12
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $12
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $12
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <4 x i32> %v1, %v2
%and = and <4 x i32> %xor1, %c
@@ -11311,32 +9595,30 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
; NO-SIMD128-LABEL: bitselect_xor_reversed_v4i32:
; NO-SIMD128: .functype bitselect_xor_reversed_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $8, $12
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $12
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $11
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $3, $pop21
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $11
-; NO-SIMD128-NEXT: i32.store 8($0), $pop10
-; NO-SIMD128-NEXT: i32.xor $push12=, $6, $10
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $2, $pop20
-; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $pop11
-; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $10
-; NO-SIMD128-NEXT: i32.store 4($0), $pop14
-; NO-SIMD128-NEXT: i32.xor $push16=, $5, $9
+; NO-SIMD128-NEXT: i32.store 12($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $7, $11
; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $1, $pop19
-; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $pop15
-; NO-SIMD128-NEXT: i32.xor $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store 0($0), $pop18
+; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop19
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $10
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $2, $pop18
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $5, $9
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $1, $pop17
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v4i32:
@@ -11349,25 +9631,23 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $9
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop19
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $10
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $7, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop18
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $11
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $8, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop19
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $12
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <4 x i32> %v1, %v2
%notc = xor <4 x i32> %c, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -11394,24 +9674,22 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: extmul_low_s_v4i32:
; NO-SIMD128: .functype extmul_low_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push0=, $11
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push0=, $12
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push4=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $10
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $11
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $9
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push6=, $10
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 12
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.extend16_s $push10=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $12
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $9
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_s_v4i32:
@@ -11429,12 +9707,10 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $11
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $4
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $12
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $12
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -11464,24 +9740,22 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: extmul_high_s_v4i32:
; NO-SIMD128: .functype extmul_high_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push0=, $15
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
+; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push4=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $14
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $15
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $13
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push6=, $14
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 12
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.extend16_s $push10=, $8
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $16
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $13
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_s_v4i32:
@@ -11499,12 +9773,10 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $15
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $16
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $16
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -11535,31 +9807,29 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128: .functype extmul_low_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop0
-; NO-SIMD128-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $11, $pop21
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop20
+; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop0
; NO-SIMD128-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $10, $pop19
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push1=, $12, $pop19
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop18
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop18
; NO-SIMD128-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop17
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store 0($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 12
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop17
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop6
; NO-SIMD128-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $4, $pop16
+; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop16
; NO-SIMD128-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop15
+; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop15
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store 4($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop14
+; NO-SIMD128-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop13
; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT: i32.store 0($0), $pop12
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_u_v4i32:
@@ -11567,30 +9837,28 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop19
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop19
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop18
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop17
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop16
; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop13
; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -11621,31 +9889,29 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128: .functype extmul_high_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $7, $pop0
-; NO-SIMD128-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $15, $pop21
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop20
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $14, $pop19
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop19
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop18
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop18
; NO-SIMD128-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop17
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store 0($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 12
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop17
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop6
; NO-SIMD128-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $8, $pop16
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop16
; NO-SIMD128-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $16, $pop15
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop15
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store 4($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop14
+; NO-SIMD128-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop13
; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT: i32.store 0($0), $pop12
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_u_v4i32:
@@ -11653,30 +9919,28 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop19
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop19
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop18
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop17
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop16
; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop13
; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -13061,16 +11325,14 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: neg_v4f32:
; NO-SIMD128: .functype neg_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.neg $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.neg $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.neg $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.neg $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.neg $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.neg $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.neg $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.neg $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v4f32:
@@ -13082,10 +11344,8 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.neg $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.neg $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.neg $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fsub nsz <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, %x
ret <4 x float> %a
@@ -13108,16 +11368,14 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: abs_v4f32:
; NO-SIMD128: .functype abs_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.abs $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.abs $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.abs $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.abs $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.abs $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.abs $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.abs $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.abs $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v4f32:
@@ -13129,10 +11387,8 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.abs $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.abs $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.abs $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
ret <4 x float> %a
@@ -13157,54 +11413,50 @@ define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.gt $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.gt $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.gt $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.gt $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_unordered_v4f32:
; NO-SIMD128-FAST: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp ule <4 x float> %x, <float 5., float 5., float 5., float 5.>
%a = select <4 x i1> %cmps, <4 x float> %x,
@@ -13231,54 +11483,50 @@ define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.lt $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.lt $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.lt $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.lt $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_unordered_v4f32:
; NO-SIMD128-FAST: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp uge <4 x float> %x, <float 5., float 5., float 5., float 5.>
%a = select <4 x i1> %cmps, <4 x float> %x,
@@ -13305,54 +11553,50 @@ define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.ge $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.ge $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.ge $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.ge $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_ordered_v4f32:
; NO-SIMD128-FAST: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
%a = select <4 x i1> %cmps,
@@ -13379,54 +11623,50 @@ define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.le $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.le $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.le $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.le $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_ordered_v4f32:
; NO-SIMD128-FAST: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
%a = select <4 x i1> %cmps,
@@ -13451,16 +11691,14 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: min_intrinsic_v4f32:
; NO-SIMD128: .functype min_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.min $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.min $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.min $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.min $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.min $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.min $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.min $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.min $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_intrinsic_v4f32:
@@ -13472,10 +11710,8 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.min $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13552,16 +11788,14 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32:
@@ -13573,10 +11807,8 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13598,16 +11830,14 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32:
@@ -13619,10 +11849,8 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13647,19 +11875,17 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push2=, fminf, $2, $pop9
-; NO-SIMD128-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push3=, fminf, $1, $pop8
-; NO-SIMD128-NEXT: f32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-NEXT: call $push6=, fminf, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop6
+; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop3
+; NO-SIMD128-NEXT: f32.const $push5=, -0x1p0
+; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: fminnumv432_non_zero_intrinsic:
@@ -13668,17 +11894,15 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0
; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop8
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop6
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push6=, fminf, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push5=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $4, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float -1.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -13755,19 +11979,17 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0
-; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fminf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0
+; NO-SIMD128-NEXT: call $push4=, fminf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fminf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: fminnumv432_one_zero_intrinsic:
@@ -13779,14 +12001,12 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0
; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fminf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -13809,16 +12029,14 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: max_intrinsic_v4f32:
; NO-SIMD128: .functype max_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.max $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.max $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.max $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.max $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.max $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.max $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.max $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.max $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_intrinsic_v4f32:
@@ -13830,10 +12048,8 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.max $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.max $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13910,16 +12126,14 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32:
; NO-SIMD128: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32:
@@ -13931,10 +12145,8 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13956,16 +12168,14 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: maxnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_nsz_intrinsic_v4f32:
@@ -13977,10 +12187,8 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -14057,19 +12265,17 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0
-; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0
+; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_one_zero_intrinsic_v4f32:
@@ -14081,14 +12287,12 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0
; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -14113,19 +12317,17 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x1p0
-; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x1p0
+; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_non_zero_intrinsic_v4f32:
@@ -14137,14 +12339,12 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x1p0
; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 1.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -14240,20 +12440,18 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: pmin_v4f32:
; NO-SIMD128: .functype pmin_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.lt $push0=, $7, $3
-; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.lt $push2=, $6, $2
-; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.lt $push4=, $5, $1
-; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4
-; NO-SIMD128-NEXT: f32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: f32.lt $push6=, $8, $4
-; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: f32.lt $push0=, $8, $4
+; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $7, $3
+; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $6, $2
+; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $5, $1
+; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmin_v4f32:
@@ -14268,11 +12466,9 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $7, $3
; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $8, $4
; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = fcmp olt <4 x float> %y, %x
%a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
@@ -14295,28 +12491,26 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: pmin_int_v4f32:
; NO-SIMD128: .functype pmin_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $8
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $4
; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $7
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $3
-; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8
-; NO-SIMD128-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $6
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $2
-; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12
-; NO-SIMD128-NEXT: i32.store 4($0), $pop13
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $5
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $1
-; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14
-; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16
-; NO-SIMD128-NEXT: i32.store 0($0), $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $3
+; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6
+; NO-SIMD128-NEXT: i32.store 8($0), $pop7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $6
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $2
+; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop11
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $5
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $1
+; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14
+; NO-SIMD128-NEXT: i32.store 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmin_int_v4f32:
@@ -14337,13 +12531,11 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $8
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $4
; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%fx = bitcast <4 x i32> %x to <4 x float>
%fy = bitcast <4 x i32> %y to <4 x float>
@@ -14368,20 +12560,18 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: pmax_v4f32:
; NO-SIMD128: .functype pmax_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.lt $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.lt $push2=, $2, $6
-; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.lt $push4=, $1, $5
-; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4
-; NO-SIMD128-NEXT: f32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: f32.lt $push6=, $4, $8
-; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7
+; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6
+; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5
+; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmax_v4f32:
@@ -14396,11 +12586,9 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = fcmp olt <4 x float> %x, %y
%a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
@@ -14423,28 +12611,26 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: pmax_int_v4f32:
; NO-SIMD128: .functype pmax_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $8
; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $7
-; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8
-; NO-SIMD128-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $2
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $6
-; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12
-; NO-SIMD128-NEXT: i32.store 4($0), $pop13
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $1
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $5
-; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14
-; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16
-; NO-SIMD128-NEXT: i32.store 0($0), $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $7
+; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6
+; NO-SIMD128-NEXT: i32.store 8($0), $pop7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $2
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $6
+; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop11
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $1
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $5
+; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14
+; NO-SIMD128-NEXT: i32.store 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmax_int_v4f32:
@@ -14465,13 +12651,11 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $4
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $8
; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%fx = bitcast <4 x i32> %x to <4 x float>
%fy = bitcast <4 x i32> %y to <4 x float>
@@ -14496,16 +12680,14 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: add_v4f32:
; NO-SIMD128: .functype add_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.add $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.add $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.add $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.add $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.add $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.add $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.add $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.add $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v4f32:
@@ -14517,10 +12699,8 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.add $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.add $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.add $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fadd <4 x float> %x, %y
ret <4 x float> %a
@@ -14542,16 +12722,14 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: sub_v4f32:
; NO-SIMD128: .functype sub_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.sub $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.sub $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.sub $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.sub $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.sub $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.sub $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.sub $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.sub $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v4f32:
@@ -14563,10 +12741,8 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.sub $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.sub $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.sub $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fsub <4 x float> %x, %y
ret <4 x float> %a
@@ -14588,16 +12764,14 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: div_v4f32:
; NO-SIMD128: .functype div_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.div $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.div $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.div $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.div $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.div $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.div $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.div $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.div $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: div_v4f32:
@@ -14609,10 +12783,8 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.div $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.div $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.div $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fdiv <4 x float> %x, %y
ret <4 x float> %a
@@ -14634,16 +12806,14 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: mul_v4f32:
; NO-SIMD128: .functype mul_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.mul $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.mul $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.mul $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.mul $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.mul $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.mul $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.mul $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.mul $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v4f32:
@@ -14655,10 +12825,8 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.mul $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.mul $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.mul $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fmul <4 x float> %x, %y
ret <4 x float> %a
@@ -14681,16 +12849,14 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: sqrt_v4f32:
; NO-SIMD128: .functype sqrt_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.sqrt $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.sqrt $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.sqrt $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.sqrt $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.sqrt $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.sqrt $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.sqrt $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.sqrt $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sqrt_v4f32:
@@ -14702,10 +12868,8 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.sqrt $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.sqrt $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.sqrt $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
ret <4 x float> %a
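
The simd.ll updates that follow show the same folding in the lane-manipulation tests (splat, replace, shuffle, build). Note the asymmetry in the old output: only a handful of offsets (0, 1, 2, 4, and 8 in the v16i8 cases) were already emitted as store immediates, while every other lane went through explicit i32.const/i32.add; after the change, every constant lane offset folds. A reduced splat mirroring the first hunk below (a hypothetical reduction, not a new test):

; Scalarized, a v16i8 splat is sixteen i32.store8 of the same register;
; with folding, each store carries its lane offset 0..15 as an immediate.
define <16 x i8> @splat16(i8 %x) {
  %v = insertelement <16 x i8> undef, i8 %x, i32 0
  %res = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %res
}
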
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index d2a38de..5ec9f6a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -38,44 +38,22 @@ define <16 x i8> @splat_v16i8(i8 %x) {
; NO-SIMD128-LABEL: splat_v16i8:
; NO-SIMD128: .functype splat_v16i8 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $1
+; NO-SIMD128-NEXT: i32.store8 14($0), $1
+; NO-SIMD128-NEXT: i32.store8 13($0), $1
+; NO-SIMD128-NEXT: i32.store8 12($0), $1
+; NO-SIMD128-NEXT: i32.store8 11($0), $1
+; NO-SIMD128-NEXT: i32.store8 10($0), $1
+; NO-SIMD128-NEXT: i32.store8 9($0), $1
; NO-SIMD128-NEXT: i32.store8 8($0), $1
+; NO-SIMD128-NEXT: i32.store8 7($0), $1
+; NO-SIMD128-NEXT: i32.store8 6($0), $1
+; NO-SIMD128-NEXT: i32.store8 5($0), $1
; NO-SIMD128-NEXT: i32.store8 4($0), $1
+; NO-SIMD128-NEXT: i32.store8 3($0), $1
; NO-SIMD128-NEXT: i32.store8 2($0), $1
; NO-SIMD128-NEXT: i32.store8 1($0), $1
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $1
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $1
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $1
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $1
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $1
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $1
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $1
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $1
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $1
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $1
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $1
; NO-SIMD128-NEXT: return
%v = insertelement <16 x i8> undef, i8 %x, i32 0
%res = shufflevector <16 x i8> %v, <16 x i8> undef,
@@ -356,44 +334,22 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: replace_v16i8:
; NO-SIMD128: .functype replace_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $17
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $17
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
%res = insertelement <16 x i8> %v, i8 %x, i32 11
ret <16 x i8> %res
@@ -461,44 +417,22 @@ define <16 x i8> @replace_zero_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: replace_zero_v16i8:
; NO-SIMD128: .functype replace_zero_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $12
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $17
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $12
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
%res = insertelement <16 x i8> %v, i8 %x, i32 0
ret <16 x i8> %res
@@ -514,44 +448,22 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_v16i8:
; NO-SIMD128: .functype shuffle_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $32
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $30
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $28
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $26
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $24
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $22
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $20
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $18
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $32
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $30
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $28
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $26
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $24
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $22
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $20
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
<16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23,
@@ -569,44 +481,22 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $2
+; NO-SIMD128-NEXT: i32.store8 14($0), $2
+; NO-SIMD128-NEXT: i32.store8 13($0), $2
+; NO-SIMD128-NEXT: i32.store8 12($0), $2
+; NO-SIMD128-NEXT: i32.store8 11($0), $2
+; NO-SIMD128-NEXT: i32.store8 10($0), $2
+; NO-SIMD128-NEXT: i32.store8 9($0), $2
; NO-SIMD128-NEXT: i32.store8 8($0), $2
+; NO-SIMD128-NEXT: i32.store8 7($0), $2
+; NO-SIMD128-NEXT: i32.store8 6($0), $2
+; NO-SIMD128-NEXT: i32.store8 5($0), $2
; NO-SIMD128-NEXT: i32.store8 4($0), $2
+; NO-SIMD128-NEXT: i32.store8 3($0), $2
; NO-SIMD128-NEXT: i32.store8 2($0), $2
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $2
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $2
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $2
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $2
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $2
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $2
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $2
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $2
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $2
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $2
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
<16 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
@@ -641,44 +531,22 @@ define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3,
; NO-SIMD128-LABEL: build_v16i8:
; NO-SIMD128: .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $12
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $12
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
i8 %x4, i8 %x5, i8 %x6, i8 %x7,
i8 %x8, i8 %x9, i8 %x10, i8 %x11,
@@ -734,22 +602,14 @@ define <8 x i16> @splat_v8i16(i16 %x) {
; NO-SIMD128-LABEL: splat_v8i16:
; NO-SIMD128: .functype splat_v8i16 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $1
+; NO-SIMD128-NEXT: i32.store16 12($0), $1
+; NO-SIMD128-NEXT: i32.store16 10($0), $1
; NO-SIMD128-NEXT: i32.store16 8($0), $1
+; NO-SIMD128-NEXT: i32.store16 6($0), $1
; NO-SIMD128-NEXT: i32.store16 4($0), $1
; NO-SIMD128-NEXT: i32.store16 2($0), $1
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $1
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $1
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $1
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $1
; NO-SIMD128-NEXT: return
%v = insertelement <8 x i16> undef, i16 %x, i32 0
%res = shufflevector <8 x i16> %v, <8 x i16> undef,
@@ -1016,22 +876,14 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: replace_v8i16:
; NO-SIMD128: .functype replace_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $9
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $9
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
%res = insertelement <8 x i16> %v, i16 %x, i32 7
ret <8 x i16> %res
@@ -1095,22 +947,14 @@ define <8 x i16> @replace_zero_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: replace_zero_v8i16:
; NO-SIMD128: .functype replace_zero_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $9
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $8
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
%res = insertelement <8 x i16> %v, i16 %x, i32 0
ret <8 x i16> %res
@@ -1126,22 +970,14 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_v8i16:
; NO-SIMD128: .functype shuffle_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $16
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $14
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $12
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $10
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $12
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
<8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
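
A reading aid for these shuffle checks (a sketch of the convention, not part of the test file): with vectors returned indirectly, $0 is the result pointer and the two <8 x i16> arguments arrive as $1-$8 and $9-$16, so each folded store simply writes whichever argument register the mask selects. For example, mask element 9 is lane 1 of %y, register $10, stored at offset 2 above:

; Result lane 1 under the mask used by shuffle_v8i16 is lane 1 of %y,
; which the NO-SIMD128 lowering stores as "i32.store16 2($0), $10".
define i16 @shuffled_lane1(<8 x i16> %x, <8 x i16> %y) {
  %res = shufflevector <8 x i16> %x, <8 x i16> %y,
           <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %lane1 = extractelement <8 x i16> %res, i32 1
  ret i16 %lane1                ; equals extractelement <8 x i16> %y, i32 1
}
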
@@ -1158,22 +994,14 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $2
+; NO-SIMD128-NEXT: i32.store16 12($0), $2
+; NO-SIMD128-NEXT: i32.store16 10($0), $2
; NO-SIMD128-NEXT: i32.store16 8($0), $2
+; NO-SIMD128-NEXT: i32.store16 6($0), $2
; NO-SIMD128-NEXT: i32.store16 4($0), $2
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $2
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $2
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $2
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
@@ -1198,22 +1026,14 @@ define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3,
; NO-SIMD128-LABEL: build_v8i16:
; NO-SIMD128: .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $8
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
i16 %x4, i16 %x5, i16 %x6, i16 %x7) {
%t0 = insertelement <8 x i16> undef, i16 %x0, i32 0
@@ -1258,12 +1078,10 @@ define <4 x i32> @splat_v4i32(i32 %x) {
; NO-SIMD128-LABEL: splat_v4i32:
; NO-SIMD128: .functype splat_v4i32 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $1
; NO-SIMD128-NEXT: i32.store 8($0), $1
; NO-SIMD128-NEXT: i32.store 4($0), $1
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $1
; NO-SIMD128-NEXT: return
%v = insertelement <4 x i32> undef, i32 %x, i32 0
%res = shufflevector <4 x i32> %v, <4 x i32> undef,
@@ -1368,12 +1186,10 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: replace_v4i32:
; NO-SIMD128: .functype replace_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $5
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x i32> %v, i32 %x, i32 2
ret <4 x i32> %res
@@ -1433,12 +1249,10 @@ define <4 x i32> @replace_zero_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: replace_zero_v4i32:
; NO-SIMD128: .functype replace_zero_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $5
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x i32> %v, i32 %x, i32 0
ret <4 x i32> %res
@@ -1454,12 +1268,10 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_v4i32:
; NO-SIMD128: .functype shuffle_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $8
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $6
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $8
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
<4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1476,12 +1288,10 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $2
; NO-SIMD128-NEXT: i32.store 8($0), $2
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1501,12 +1311,10 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; NO-SIMD128-LABEL: build_v4i32:
; NO-SIMD128: .functype build_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%t0 = insertelement <4 x i32> undef, i32 %x0, i32 0
%t1 = insertelement <4 x i32> %t0, i32 %x1, i32 1
@@ -1801,12 +1609,10 @@ define <4 x float> @splat_v4f32(float %x) {
; NO-SIMD128-LABEL: splat_v4f32:
; NO-SIMD128: .functype splat_v4f32 (i32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $1
; NO-SIMD128-NEXT: f32.store 8($0), $1
; NO-SIMD128-NEXT: f32.store 4($0), $1
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $1
; NO-SIMD128-NEXT: return
%v = insertelement <4 x float> undef, float %x, i32 0
%res = shufflevector <4 x float> %v, <4 x float> undef,
@@ -1911,12 +1717,10 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
; NO-SIMD128-LABEL: replace_v4f32:
; NO-SIMD128: .functype replace_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $5
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x float> %v, float %x, i32 2
ret <4 x float> %res
@@ -1976,12 +1780,10 @@ define <4 x float> @replace_zero_v4f32(<4 x float> %v, float %x) {
; NO-SIMD128-LABEL: replace_zero_v4f32:
; NO-SIMD128: .functype replace_zero_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $5
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x float> %v, float %x, i32 0
ret <4 x float> %res
@@ -1997,12 +1799,10 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_v4f32:
; NO-SIMD128: .functype shuffle_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $8
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $6
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $8
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
<4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -2019,12 +1819,10 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $2
; NO-SIMD128-NEXT: f32.store 8($0), $2
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -2044,12 +1842,10 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) {
; NO-SIMD128-LABEL: build_v4f32:
; NO-SIMD128: .functype build_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%t0 = insertelement <4 x float> undef, float %x0, i32 0
%t1 = insertelement <4 x float> %t0, float %x1, i32 1
diff --git a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
index 609be3b..50e736a 100644
--- a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s | FileCheck %s
; Check that the shr(shl X, 56), 48) is not mistakenly turned into
@@ -16,11 +17,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-unknown-linux-gnu"
define i64 @foo(i64 %b) nounwind readnone {
-entry:
; CHECK-LABEL: foo:
-; CHECK: movsbq %dil, %rax
-; CHECK: shlq $8, %rax
-; CHECK: orq $1, %rax
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movsbq %dil, %rax
+; CHECK-NEXT: shlq $8, %rax
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: retq
+entry:
%shl = shl i64 %b, 56 ; <i64> [#uses=1]
%shr = ashr i64 %shl, 48 ; <i64> [#uses=1]
%add5 = or i64 %shr, 1 ; <i64> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/AppendingLinkage.ll b/llvm/test/CodeGen/X86/AppendingLinkage.ll
index 83bfbe8..ace5d19 100644
--- a/llvm/test/CodeGen/X86/AppendingLinkage.ll
+++ b/llvm/test/CodeGen/X86/AppendingLinkage.ll
@@ -1,4 +1,4 @@
; RUN: not --crash llc < %s -mtriple=i686-- 2>&1 | FileCheck %s
-; CHECK: unknown special variable
+; CHECK: unknown special variable with appending linkage
@foo = appending constant [1 x i32 ]zeroinitializer
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index 7a8ddf5..cb2d426 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -84,25 +84,22 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
define <8 x i16> @combine_pavgw_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_pavgw_demandedelts:
; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13]
; SSE-NEXT: pavgw %xmm1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pavgw_demandedelts:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13]
; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pavgw_demandedelts:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
%s0 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%avg = tail call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %s0, <8 x i16> %a1)
diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
index 548cf24..13c9585 100644
--- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -869,13 +869,13 @@ body: |
$ymm0 = VSHUFPSZ256rmi $ymm0, $rdi, 1, $noreg, 0, $noreg, -24
; CHECK: $ymm0 = VSHUFPSYrri $ymm0, $ymm1, -24
$ymm0 = VSHUFPSZ256rri $ymm0, $ymm1, -24
- ; CHECK: $ymm0 = VROUNDPDYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPDYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPDYr $ymm0, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPDYri $ymm0, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPDZ256rri $ymm0, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPSYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPSYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPSYr $ymm0, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPSYri $ymm0, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPSZ256rri $ymm0, 15, implicit $mxcsr
; CHECK: $ymm0 = VPERM2F128rm $ymm0, $rip, 1, $noreg, 0, $noreg, 32
$ymm0 = VSHUFF32X4Z256rmi $ymm0, $rip, 1, $noreg, 0, $noreg, 228
@@ -1751,13 +1751,13 @@ body: |
$xmm0 = VALIGNQZ128rmi $xmm0, $rip, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm0 = VPALIGNRrri $xmm0, $xmm1, 8
$xmm0 = VALIGNQZ128rri $xmm0, $xmm1, 1
- ; CHECK: $xmm0 = VROUNDPDm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPDmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPDr $xmm0, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPDri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPDZ128rri $xmm0, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPSm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPSmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPSr $xmm0, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPSri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr
RET64
@@ -2308,21 +2308,21 @@ body: |
$xmm0 = VINSERTPSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm0 = VINSERTPSrr $xmm0, $xmm0, 1
$xmm0 = VINSERTPSZrr $xmm0, $xmm0, 1
- ; CHECK: $xmm0 = VROUNDSDm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDr $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDri $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSr $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSri $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDr_Int $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDri_Int $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSr_Int $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSri_Int $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 15, implicit $mxcsr
RET64
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index d9ee5f0..ee7f4ae 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -173,16 +173,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovdqa (%edx), %xmm0
; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
-; X86-NEXT: vpextrb $6, %xmm0, %ecx
-; X86-NEXT: movb %cl, (%eax)
+; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
-; X64-NEXT: vpextrb $6, %xmm0, %eax
-; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
%i1 = load <16 x i8>, ptr %origin1
diff --git a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
index 64d44d9..0123431 100644
--- a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
+++ b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
@@ -1,59 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple x86_64-unknown-unknown -exception-model sjlj -verify-machineinstrs=0 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=NUM
; RUN: llc -mtriple x86_64-unknown-unknown -exception-model sjlj -verify-machineinstrs=0 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=SJLJ
-; NUM-COUNT-3: endbr64
-
-;SJLJ: main: # @main
-;SJLJ-NEXT: .Lfunc_begin0:
-;SJLJ-NEXT: # %bb.0: # %entry
-;SJLJ-NEXT: endbr64
-;SJLJ-NEXT: pushq %rbp
-;SJLJ: callq _Unwind_SjLj_Register
-;SJLJ-NEXT: .Ltmp0:
-;SJLJ-NEXT: callq _Z3foov
-;SJLJ-NEXT: .Ltmp1:
-;SJLJ-NEXT: # %bb.1: # %invoke.cont
-;SJLJ-NEXT: movl
-;SJLJ-NEXT: .LBB0_7: # %return
-;SJLJ: callq _Unwind_SjLj_Unregister
-;SJLJ: retq
-;SJLJ-NEXT: .LBB0_9:
-;SJLJ-NEXT: endbr64
-;SJLJ-NEXT: movl
-;SJLJ-NEXT: cmpl
-;SJLJ-NEXT: jb .LBB0_10
-;SJLJ-NEXT: # %bb.11:
-;SJLJ-NEXT: ud2
-;SJLJ-NEXT: .LBB0_10:
-;SJLJ-NEXT: leaq .LJTI0_0(%rip), %rcx
-;SJLJ-NEXT: jmpq *(%rcx,%rax,8)
-;SJLJ-NEXT: .LBB0_2: # %lpad
-;SJLJ-NEXT: .Ltmp2:
-;SJLJ-NEXT: endbr64
-;SJLJ: jne .LBB0_4
-;SJLJ-NEXT: # %bb.3: # %catch3
-;SJLJ: callq __cxa_begin_catch
-;SJLJ: jmp .LBB0_6
-;SJLJ-NEXT: .LBB0_4: # %catch.fallthrough
-;SJLJ-NEXT: cmpl
-;SJLJ-NEXT: jne .LBB0_8
-;SJLJ-NEXT: # %bb.5: # %catch
-;SJLJ: callq __cxa_begin_catch
-;SJLJ: cmpb
-;SJLJ-NEXT: .LBB0_6: # %return
-;SJLJ: callq __cxa_end_catch
-;SJLJ-NEXT: jmp .LBB0_7
-;SJLJ-NEXT: .LBB0_8: # %eh.resume
-;SJLJ-NEXT: movl
-;SJLJ-NEXT: .Lfunc_end0:
-;SJLJ: .LJTI0_0:
-;SJLJ-NEXT: .quad .LBB0_2
-
@_ZTIi = external dso_local constant ptr
@_ZTIc = external dso_local constant ptr
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main() #0 personality ptr @__gxx_personality_sj0 {
+; NUM-LABEL: main:
+; NUM: # %bb.0: # %entry
+; NUM-NEXT: endbr64
+; NUM-NEXT: pushq %rbp
+; NUM-NEXT: movq %rsp, %rbp
+; NUM-NEXT: pushq %r15
+; NUM-NEXT: pushq %r14
+; NUM-NEXT: pushq %r13
+; NUM-NEXT: pushq %r12
+; NUM-NEXT: pushq %rbx
+; NUM-NEXT: subq $120, %rsp
+; NUM-NEXT: movl $0, -44(%rbp)
+; NUM-NEXT: movq $__gxx_personality_sj0, -120(%rbp)
+; NUM-NEXT: movq $GCC_except_table0, -112(%rbp)
+; NUM-NEXT: movq %rbp, -104(%rbp)
+; NUM-NEXT: movq %rsp, -88(%rbp)
+; NUM-NEXT: movq $.LBB0_9, -96(%rbp)
+; NUM-NEXT: movl $1, -144(%rbp)
+; NUM-NEXT: leaq -152(%rbp), %rdi
+; NUM-NEXT: callq _Unwind_SjLj_Register@PLT
+; NUM-NEXT: .Ltmp0:
+; NUM-NEXT: callq _Z3foov
+; NUM-NEXT: .Ltmp1:
+; NUM-NEXT: # %bb.1: # %invoke.cont
+; NUM-NEXT: movl $1, -44(%rbp)
+; NUM-NEXT: .LBB0_7: # %return
+; NUM-NEXT: movl -44(%rbp), %ebx
+; NUM-NEXT: leaq -152(%rbp), %rdi
+; NUM-NEXT: callq _Unwind_SjLj_Unregister@PLT
+; NUM-NEXT: movl %ebx, %eax
+; NUM-NEXT: addq $120, %rsp
+; NUM-NEXT: popq %rbx
+; NUM-NEXT: popq %r12
+; NUM-NEXT: popq %r13
+; NUM-NEXT: popq %r14
+; NUM-NEXT: popq %r15
+; NUM-NEXT: popq %rbp
+; NUM-NEXT: retq
+; NUM-NEXT: .LBB0_9:
+; NUM-NEXT: endbr64
+; NUM-NEXT: movl -144(%rbp), %eax
+; NUM-NEXT: cmpl $1, %eax
+; NUM-NEXT: jb .LBB0_10
+; NUM-NEXT: # %bb.11:
+; NUM-NEXT: ud2
+; NUM-NEXT: .LBB0_10:
+; NUM-NEXT: leaq .LJTI0_0(%rip), %rcx
+; NUM-NEXT: jmpq *(%rcx,%rax,8)
+; NUM-NEXT: .LBB0_2: # %lpad
+; NUM-NEXT: .Ltmp2:
+; NUM-NEXT: endbr64
+; NUM-NEXT: movl -140(%rbp), %ecx
+; NUM-NEXT: movl -136(%rbp), %eax
+; NUM-NEXT: movq %rcx, -56(%rbp)
+; NUM-NEXT: movl %eax, -64(%rbp)
+; NUM-NEXT: cmpl $2, %eax
+; NUM-NEXT: jne .LBB0_4
+; NUM-NEXT: # %bb.3: # %catch3
+; NUM-NEXT: movq -56(%rbp), %rdi
+; NUM-NEXT: movl $-1, -144(%rbp)
+; NUM-NEXT: callq __cxa_begin_catch
+; NUM-NEXT: movl (%rax), %eax
+; NUM-NEXT: movl %eax, -60(%rbp)
+; NUM-NEXT: xorl %ecx, %ecx
+; NUM-NEXT: cmpl $5, %eax
+; NUM-NEXT: jmp .LBB0_6
+; NUM-NEXT: .LBB0_4: # %catch.fallthrough
+; NUM-NEXT: cmpl $1, %eax
+; NUM-NEXT: jne .LBB0_8
+; NUM-NEXT: # %bb.5: # %catch
+; NUM-NEXT: movq -56(%rbp), %rdi
+; NUM-NEXT: movl $-1, -144(%rbp)
+; NUM-NEXT: callq __cxa_begin_catch
+; NUM-NEXT: movzbl (%rax), %eax
+; NUM-NEXT: movb %al, -45(%rbp)
+; NUM-NEXT: xorl %ecx, %ecx
+; NUM-NEXT: cmpb $3, %al
+; NUM-NEXT: .LBB0_6: # %return
+; NUM-NEXT: setne %cl
+; NUM-NEXT: movl %ecx, -44(%rbp)
+; NUM-NEXT: movl $-1, -144(%rbp)
+; NUM-NEXT: callq __cxa_end_catch
+; NUM-NEXT: jmp .LBB0_7
+; NUM-NEXT: .LBB0_8: # %eh.resume
+; NUM-NEXT: movl $-1, -144(%rbp)
+;
+; SJLJ-LABEL: main:
+; SJLJ: # %bb.0: # %entry
+; SJLJ-NEXT: endbr64
+; SJLJ-NEXT: pushq %rbp
+; SJLJ-NEXT: movq %rsp, %rbp
+; SJLJ-NEXT: pushq %r15
+; SJLJ-NEXT: pushq %r14
+; SJLJ-NEXT: pushq %r13
+; SJLJ-NEXT: pushq %r12
+; SJLJ-NEXT: pushq %rbx
+; SJLJ-NEXT: subq $120, %rsp
+; SJLJ-NEXT: movl $0, -44(%rbp)
+; SJLJ-NEXT: movq $__gxx_personality_sj0, -120(%rbp)
+; SJLJ-NEXT: movq $GCC_except_table0, -112(%rbp)
+; SJLJ-NEXT: movq %rbp, -104(%rbp)
+; SJLJ-NEXT: movq %rsp, -88(%rbp)
+; SJLJ-NEXT: movq $.LBB0_9, -96(%rbp)
+; SJLJ-NEXT: movl $1, -144(%rbp)
+; SJLJ-NEXT: leaq -152(%rbp), %rdi
+; SJLJ-NEXT: callq _Unwind_SjLj_Register@PLT
+; SJLJ-NEXT: .Ltmp0:
+; SJLJ-NEXT: callq _Z3foov
+; SJLJ-NEXT: .Ltmp1:
+; SJLJ-NEXT: # %bb.1: # %invoke.cont
+; SJLJ-NEXT: movl $1, -44(%rbp)
+; SJLJ-NEXT: .LBB0_7: # %return
+; SJLJ-NEXT: movl -44(%rbp), %ebx
+; SJLJ-NEXT: leaq -152(%rbp), %rdi
+; SJLJ-NEXT: callq _Unwind_SjLj_Unregister@PLT
+; SJLJ-NEXT: movl %ebx, %eax
+; SJLJ-NEXT: addq $120, %rsp
+; SJLJ-NEXT: popq %rbx
+; SJLJ-NEXT: popq %r12
+; SJLJ-NEXT: popq %r13
+; SJLJ-NEXT: popq %r14
+; SJLJ-NEXT: popq %r15
+; SJLJ-NEXT: popq %rbp
+; SJLJ-NEXT: retq
+; SJLJ-NEXT: .LBB0_9:
+; SJLJ-NEXT: endbr64
+; SJLJ-NEXT: movl -144(%rbp), %eax
+; SJLJ-NEXT: cmpl $1, %eax
+; SJLJ-NEXT: jb .LBB0_10
+; SJLJ-NEXT: # %bb.11:
+; SJLJ-NEXT: ud2
+; SJLJ-NEXT: .LBB0_10:
+; SJLJ-NEXT: leaq .LJTI0_0(%rip), %rcx
+; SJLJ-NEXT: jmpq *(%rcx,%rax,8)
+; SJLJ-NEXT: .LBB0_2: # %lpad
+; SJLJ-NEXT: .Ltmp2:
+; SJLJ-NEXT: endbr64
+; SJLJ-NEXT: movl -140(%rbp), %ecx
+; SJLJ-NEXT: movl -136(%rbp), %eax
+; SJLJ-NEXT: movq %rcx, -56(%rbp)
+; SJLJ-NEXT: movl %eax, -64(%rbp)
+; SJLJ-NEXT: cmpl $2, %eax
+; SJLJ-NEXT: jne .LBB0_4
+; SJLJ-NEXT: # %bb.3: # %catch3
+; SJLJ-NEXT: movq -56(%rbp), %rdi
+; SJLJ-NEXT: movl $-1, -144(%rbp)
+; SJLJ-NEXT: callq __cxa_begin_catch
+; SJLJ-NEXT: movl (%rax), %eax
+; SJLJ-NEXT: movl %eax, -60(%rbp)
+; SJLJ-NEXT: xorl %ecx, %ecx
+; SJLJ-NEXT: cmpl $5, %eax
+; SJLJ-NEXT: jmp .LBB0_6
+; SJLJ-NEXT: .LBB0_4: # %catch.fallthrough
+; SJLJ-NEXT: cmpl $1, %eax
+; SJLJ-NEXT: jne .LBB0_8
+; SJLJ-NEXT: # %bb.5: # %catch
+; SJLJ-NEXT: movq -56(%rbp), %rdi
+; SJLJ-NEXT: movl $-1, -144(%rbp)
+; SJLJ-NEXT: callq __cxa_begin_catch
+; SJLJ-NEXT: movzbl (%rax), %eax
+; SJLJ-NEXT: movb %al, -45(%rbp)
+; SJLJ-NEXT: xorl %ecx, %ecx
+; SJLJ-NEXT: cmpb $3, %al
+; SJLJ-NEXT: .LBB0_6: # %return
+; SJLJ-NEXT: setne %cl
+; SJLJ-NEXT: movl %ecx, -44(%rbp)
+; SJLJ-NEXT: movl $-1, -144(%rbp)
+; SJLJ-NEXT: callq __cxa_end_catch
+; SJLJ-NEXT: jmp .LBB0_7
+; SJLJ-NEXT: .LBB0_8: # %eh.resume
+; SJLJ-NEXT: movl $-1, -144(%rbp)
entry:
%retval = alloca i32, align 4
%exn.slot = alloca ptr
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index 8fa7ce0..eb5d172 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -12,7 +12,7 @@ define void @_start() nounwind {
; FAST-SHLD-NEXT: shrq $2, %rcx
; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx
; FAST-SHLD-NEXT: andq $-4, %rax
-; FAST-SHLD-NEXT: orq $1, %rax
+; FAST-SHLD-NEXT: incq %rax
; FAST-SHLD-NEXT: movq %rax, -40(%rsp)
; FAST-SHLD-NEXT: movq %rcx, -32(%rsp)
; FAST-SHLD-NEXT: orq $-2, -56(%rsp)
@@ -23,7 +23,7 @@ define void @_start() nounwind {
; SLOW-SHLD: # %bb.0: # %Entry
; SLOW-SHLD-NEXT: movq -40(%rsp), %rax
; SLOW-SHLD-NEXT: andq $-4, %rax
-; SLOW-SHLD-NEXT: orq $1, %rax
+; SLOW-SHLD-NEXT: incq %rax
; SLOW-SHLD-NEXT: movq %rax, -40(%rsp)
; SLOW-SHLD-NEXT: orq $-2, -56(%rsp)
; SLOW-SHLD-NEXT: movq $-1, -48(%rsp)
diff --git a/llvm/test/CodeGen/X86/pr23664.ll b/llvm/test/CodeGen/X86/pr23664.ll
index 453e5db..8179602 100644
--- a/llvm/test/CodeGen/X86/pr23664.ll
+++ b/llvm/test/CodeGen/X86/pr23664.ll
@@ -6,7 +6,7 @@ define i2 @f(i32 %arg) {
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal (%rdi,%rdi), %eax
-; CHECK-NEXT: orb $1, %al
+; CHECK-NEXT: incb %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%trunc = trunc i32 %arg to i1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
new file mode 100644
index 0000000..32c7e82
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
@@ -0,0 +1,2213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
+define <8 x i32> @trunc8i64_8i32_nsw(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i32_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32_nsw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_nsw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_nsw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i32_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i64> %a to <8 x i32>
+ ret <8 x i32> %0
+}
+
+define <8 x i32> @trunc8i64_8i32_nuw(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_nuw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i32_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32_nuw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_nuw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_nuw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i32_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i64> %a to <8 x i32>
+ ret <8 x i32> %0
+}
+
+define <8 x i16> @trunc8i64_8i16_nsw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i16_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i16_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i64> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define <8 x i16> @trunc8i64_8i16_nuw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i16_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i16_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i64> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define void @trunc8i64_8i8_nsw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i8_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i64> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc8i64_8i8_nuw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i8_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i64> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define <8 x i16> @trunc8i32_8i16_nsw(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i32> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define <8 x i16> @trunc8i32_8i16_nuw(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i32> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define void @trunc8i32_8i8_nsw(<8 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc8i32_8i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i32> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc8i32_8i8_nuw(<8 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc8i32_8i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i32> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i16_nsw(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb %xmm4, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i16_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i32> %a to <16 x i16>
+ store <16 x i16> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i16_nuw(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb %xmm4, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i16_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i32> %a to <16 x i16>
+ store <16 x i16> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i8_nsw(<16 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc16i32_16i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i8_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i32> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i8_nuw(<16 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc16i32_16i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i8_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i32> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i16_16i8_nsw(<16 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc16i16_16i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i16_16i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i16_16i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i16_16i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i16> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i16_16i8_nuw(<16 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc16i16_16i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i16_16i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i16_16i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i16_16i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i16> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc32i16_32i8_nsw(<32 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc32i16_32i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc32i16_32i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc32i16_32i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc32i16_32i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc32i16_32i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc32i16_32i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc32i16_32i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc32i16_32i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <32 x i16> %a to <32 x i8>
+ store <32 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc32i16_32i8_nuw(<32 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc32i16_32i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc32i16_32i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc32i16_32i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc32i16_32i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc32i16_32i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc32i16_32i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc32i16_32i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc32i16_32i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <32 x i16> %a to <32 x i8>
+ store <32 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define <8 x i32> @trunc2x4i64_8i32_nsw(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: trunc2x4i64_8i32_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i32_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc2x4i64_8i32_nsw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32_nsw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32_nsw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc2x4i64_8i32_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <4 x i64> %a to <4 x i32>
+ %1 = trunc nsw <4 x i64> %b to <4 x i32>
+ %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @trunc2x4i64_8i32_nuw(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: trunc2x4i64_8i32_nuw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i32_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc2x4i64_8i32_nuw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32_nuw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32_nuw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc2x4i64_8i32_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <4 x i64> %a to <4 x i32>
+ %1 = trunc nuw <4 x i64> %b to <4 x i32>
+ %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %2
+}
+
+define <8 x i16> @trunc2x4i64_8i16_nsw(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-SSSE3-LABEL: trunc2x4i64_8i16_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i64_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x4i64_8i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <4 x i64> %a to <4 x i16>
+ %1 = trunc nsw <4 x i64> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc2x4i64_8i16_nuw(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-SSSE3-LABEL: trunc2x4i64_8i16_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i64_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x4i64_8i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <4 x i64> %a to <4 x i16>
+ %1 = trunc nuw <4 x i64> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @trunc2x2i64_4i32_nsw(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: trunc2x2i64_4i32_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: trunc2x2i64_4i32_nsw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <2 x i64> %a to <2 x i32>
+ %1 = trunc nsw <2 x i64> %b to <2 x i32>
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @trunc2x2i64_4i32_nuw(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: trunc2x2i64_4i32_nuw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: trunc2x2i64_4i32_nuw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <2 x i64> %a to <2 x i32>
+ %1 = trunc nuw <2 x i64> %b to <2 x i32>
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc2x4i32_8i16_nsw(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: trunc2x4i32_8i16_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc2x4i32_8i16_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i32_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc2x4i32_8i16_nsw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <4 x i32> %a to <4 x i16>
+ %1 = trunc nsw <4 x i32> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc2x4i32_8i16_nuw(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: trunc2x4i32_8i16_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc2x4i32_8i16_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i32_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc2x4i32_8i16_nuw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <4 x i32> %a to <4 x i16>
+ %1 = trunc nuw <4 x i32> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <32 x i8> @trunc2x16i16_32i8_nsw(<16 x i16> %a, <16 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x16i16_32i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x16i16_32i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: packuswb %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x16i16_32i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x16i16_32i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i16> %a to <16 x i8>
+ %1 = trunc nsw <16 x i16> %b to <16 x i8>
+ %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %2
+}
+
+define <32 x i8> @trunc2x16i16_32i8_nuw(<16 x i16> %a, <16 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x16i16_32i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x16i16_32i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: packuswb %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x16i16_32i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x16i16_32i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i16> %a to <16 x i8>
+ %1 = trunc nuw <16 x i16> %b to <16 x i8>
+ %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %2
+}
+
+define <16 x i8> @trunc2x8i16_16i8_nsw(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x8i16_16i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x8i16_16i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x8i16_16i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x8i16_16i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i16> %a to <8 x i8>
+ %1 = trunc nsw <8 x i16> %b to <8 x i8>
+ %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc2x8i16_16i8_nuw(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x8i16_16i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x8i16_16i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x8i16_16i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x8i16_16i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i16> %a to <8 x i8>
+ %1 = trunc nuw <8 x i16> %b to <8 x i8>
+ %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %2
+}
+
+define i64 @trunc8i16_i64_nsw(<8 x i16> %inval) {
+; SSE2-LABEL: trunc8i16_i64_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i16_i64_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq %xmm0, %rax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i16_i64_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc8i16_i64_nsw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i16_i64_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i16_i64_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i16_i64_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i16_i64_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, %rax
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i16> %inval to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to i64
+ ret i64 %1
+}
+
+define i64 @trunc8i16_i64_nuw(<8 x i16> %inval) {
+; SSE2-LABEL: trunc8i16_i64_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i16_i64_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq %xmm0, %rax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i16_i64_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc8i16_i64_nuw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i16_i64_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i16_i64_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i16_i64_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i16_i64_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, %rax
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i16> %inval to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to i64
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 691ca40..f7a27a5 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,6 +65,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -74,6 +75,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -81,14 +83,15 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NEXT: movb %al, (%edx)
+; X86-NO-BMI2-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NEXT: movb %dl, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +100,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -119,6 +123,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -128,6 +133,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -139,6 +145,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -151,6 +158,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
index 8d96ab0..f75042b 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S \
-; RUN: -hwasan-selective-instrumentation=0 | FileCheck %s --check-prefix=FULL
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S \
-; RUN: -hwasan-selective-instrumentation=1 | FileCheck %s --check-prefix=SELSAN
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S | FileCheck %s --check-prefix=FULL
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=990000 | FileCheck %s --check-prefix=SELSAN
; FULL: @not_sanitized
; FULL-NEXT: %x = alloca i8, i64 4
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
index 28e43a9..ab3f56d 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
@@ -1,23 +1,19 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: | FileCheck %s --check-prefix=DEFAULT
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-percentile-cutoff-hot=700000 | FileCheck %s --check-prefix=HOT_RATE
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-random-skip-rate=0.0 | FileCheck %s --check-prefix=RANDOM_RATE_0
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-random-skip-rate=1.0 | FileCheck %s --check-prefix=RANDOM_RATE_1
-
-; DEFAULT: @sanitized
-; DEFAULT-NEXT: %x = alloca i8, i64 4
-
-; HOT_RATE: @sanitized
-; HOT_RATE-NEXT: @__hwasan_tls
-
-; RANDOM_RATE_0: @sanitized
-; RANDOM_RATE_0-NEXT: @__hwasan_tls
-
-; RANDOM_RATE_1: @sanitized
-; RANDOM_RATE_1-NEXT: %x = alloca i8, i64 4
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=700000 | FileCheck %s --check-prefix=HOT70
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=990000 | FileCheck %s --check-prefix=HOT99
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-random-skip-rate=0.0 | FileCheck %s --check-prefix=RANDOM0
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-random-skip-rate=1.0 | FileCheck %s --check-prefix=RANDOM1
+
+; HOT70: @sanitized
+; HOT70-NEXT: @__hwasan_tls
+
+; HOT99: @sanitized
+; HOT99-NEXT: %x = alloca i8, i64 4
+
+; RANDOM0: @sanitized
+; RANDOM0-NEXT: @__hwasan_tls
+
+; RANDOM1: @sanitized
+; RANDOM1-NEXT: %x = alloca i8, i64 4
declare void @use(ptr)
diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
index 056221f..58b7847 100644
--- a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
@@ -23,3 +23,24 @@ v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
// GFX1150: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_f32_e64_dpp v5, v1, s2 row_mirror
+// GFX1150: encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_min3_f16 v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff]
+
+v_cmp_le_f32 vcc_lo, v1, v2 row_mirror
+// GFX1150: encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+
+v_cmp_le_f32 vcc_lo, v1, s2 row_mirror
+// GFX1150: encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_cmp_le_f32 vcc_lo, v1, s2 quad_perm:[1,1,1,1]
+// GFX1150: encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x55,0x00,0xff]
+
+v_cmpx_neq_f16 v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_class_f16 v1, 2.0 quad_perm:[1,1,1,1]
+// GFX1150: encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x55,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index da1989e..3ec3162 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -51,13 +51,13 @@ v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -135,7 +135,7 @@ v_fmac_f16_e64_dpp v5, s2, v3 quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_fmac_f16_e64_dpp v5, v2, 1.0 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_fmac_f32_e64_dpp v5, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -144,7 +144,7 @@ v_fmac_f32_e64_dpp v5, 0x1234, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_fmac_f32_e64_dpp v5, v2, 1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_fmac_f32_e64_dpp v5, -1.0, v3 quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
index bb911c6..7393de2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
@@ -6,22 +6,49 @@
//
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmp_le_f32 vcc_lo, v1, v2 row_mirror
+// GFX12: encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_mirror
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_half_mirror
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x41,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_shl:15
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_shr:1
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x11,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_ror:1
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x21,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp vcc_hi, |v1|, -s99 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0x6b,0x01,0x12,0xd4,0xfa,0xc6,0x00,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmp_eq_f32_e64_dpp ttmp15, -v1, |s99| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0x7b,0x02,0x12,0xd4,0xfa,0xc6,0x00,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_gt_f32_e64_dpp v255, 4.0 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0x7e,0x00,0x94,0xd4,0xe9,0xec,0x01,0x00,0xff,0x00,0x00,0x00]
//
// Elements of CPol operand can be given in any order
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index 88bdb7e..d0e309a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -6,6 +6,12 @@
v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -57,6 +63,10 @@ v_add_co_u32_e64_dpp v5, s6, v1, v2 row_mirror
// W32: [0x05,0x06,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s6, v1, s2 row_mirror
+// W32: [0x05,0x06,0x00,0xd7,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror
// W32: [0x05,0x06,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -113,6 +123,10 @@ v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror
// W64: [0x05,0x0c,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s[12:13], v1, s2 row_half_mirror
+// W64: [0x05,0x0c,0x00,0xd7,0xfa,0x04,0x00,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1
// W64: [0x05,0x0c,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -155,6 +169,12 @@ v_add_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank
v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_add_lshl_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add_lshl_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -323,6 +343,12 @@ v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:
v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_alignbit_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_alignbit_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -365,6 +391,12 @@ v_alignbit_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_
v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_alignbyte_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_alignbyte_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -449,6 +481,12 @@ v_and_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound
v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_and_or_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_and_or_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -575,6 +613,12 @@ v_bcnt_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0
v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_bfe_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_bfe_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -617,6 +661,12 @@ v_bfe_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_bfe_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_bfe_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -659,6 +709,12 @@ v_bfe_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_bfi_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_bfi_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -752,6 +808,14 @@ v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror
// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror
+// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror
+// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror
// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -808,6 +872,14 @@ v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror
// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror
+// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror
+// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1
// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -850,6 +922,12 @@ v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 ban
v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubeid_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubeid_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -892,6 +970,12 @@ v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubema_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubema_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -934,6 +1018,12 @@ v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubesc_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubesc_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -976,6 +1066,12 @@ v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubetc_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubetc_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1378,6 +1474,12 @@ v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cvt_pk_u8_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cvt_pk_u8_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1588,6 +1690,12 @@ v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 b
v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_div_fixup_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_div_fixup_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1630,6 +1738,12 @@ v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 ro
v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1672,6 +1786,12 @@ v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask
v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1756,6 +1876,9 @@ v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 ba
v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_lerp_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1798,6 +1921,12 @@ v_lerp_u8_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_lshl_add_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_lshl_add_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1840,6 +1969,12 @@ v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_
v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_lshl_or_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_lshl_or_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1966,6 +2101,12 @@ v_lshrrev_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 b
v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2008,6 +2149,12 @@ v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_i32_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_i32_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2050,6 +2197,12 @@ v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_i32_i24_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_i32_i24_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2092,6 +2245,12 @@ v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2134,6 +2293,12 @@ v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_u32_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_u32_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2176,6 +2341,12 @@ v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_u32_u24_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_u32_u24_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2218,6 +2389,12 @@ v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2260,6 +2437,12 @@ v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row
v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2302,6 +2485,12 @@ v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:
v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2344,6 +2533,12 @@ v_max3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2386,6 +2581,12 @@ v_max3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2428,6 +2629,12 @@ v_max3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2554,6 +2761,12 @@ v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound
v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2596,6 +2809,12 @@ v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2638,6 +2857,12 @@ v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2680,6 +2905,12 @@ v_maxmin_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_ma
v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2806,6 +3037,12 @@ v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:
v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2848,6 +3085,12 @@ v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row
v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2890,6 +3133,12 @@ v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:
v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2932,6 +3181,12 @@ v_med3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2974,6 +3229,12 @@ v_med3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3016,6 +3277,12 @@ v_med3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3058,6 +3325,12 @@ v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3100,6 +3373,12 @@ v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row
v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3142,6 +3421,12 @@ v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:
v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3184,6 +3469,12 @@ v_min3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3226,6 +3517,12 @@ v_min3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3268,6 +3565,12 @@ v_min3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3394,6 +3697,12 @@ v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound
v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3436,6 +3745,12 @@ v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3478,6 +3793,12 @@ v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3520,6 +3841,12 @@ v_minmax_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_ma
v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3562,6 +3889,9 @@ v_minmax_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_ma
v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_msad_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3646,6 +3976,12 @@ v_mul_lo_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bo
v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mullit_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mullit_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3688,6 +4024,12 @@ v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_or3_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_or3_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3814,6 +4156,12 @@ v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mas
v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_perm_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_perm_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3856,6 +4204,9 @@ v_perm_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_hi_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3898,6 +4249,12 @@ v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 ba
v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_sad_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3940,6 +4297,12 @@ v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_sad_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3982,6 +4345,9 @@ v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -4033,6 +4399,10 @@ v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_mirror
// W32: [0x05,0x06,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s6, v1, s2 row_mirror
+// W32: [0x05,0x06,0x01,0xd7,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror
// W32: [0x05,0x06,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4089,6 +4459,10 @@ v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror
// W64: [0x05,0x0c,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s[12:13], v1, s2 row_half_mirror
+// W64: [0x05,0x0c,0x01,0xd7,0xfa,0x04,0x00,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1
// W64: [0x05,0x0c,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4266,6 +4640,10 @@ v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_mirror
// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s6, v1, s2 row_mirror
+// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror
// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4322,6 +4700,10 @@ v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror
// W64: [0x05,0x0c,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s[12:13], v1, s2 row_half_mirror
+// W64: [0x05,0x0c,0x02,0xd7,0xfa,0x04,0x00,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1
// W64: [0x05,0x0c,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4364,6 +4746,12 @@ v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 b
v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_xad_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_xad_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -4406,6 +4794,12 @@ v_xad_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_xor3_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_xor3_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -4770,7 +5164,7 @@ v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x04,0x00]
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX12: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
@@ -4791,7 +5185,7 @@ v_dot2_bf16_bf16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_ma
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_bf16_bf16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x00,0x00]
v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX12: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
@@ -4973,6 +5367,12 @@ v_maximum_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bou
v_minimum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimum3_f32 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimum3_f32 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5015,6 +5415,12 @@ v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_m
v_maximum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maximum3_f32 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maximum3_f32 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maximum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5057,6 +5463,12 @@ v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_m
v_minimum3_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimum3_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimum3_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimum3_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5099,6 +5511,12 @@ v_minimum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x
v_maximum3_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maximum3_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maximum3_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maximum3_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5180,6 +5598,12 @@ v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimummaximum_f32 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimummaximum_f32 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5222,6 +5646,12 @@ v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_maximumminimum_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maximumminimum_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maximumminimum_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maximumminimum_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5264,6 +5694,12 @@ v_maximumminimum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_m
v_minimummaximum_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimummaximum_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimummaximum_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimummaximum_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 0e84765..25b13ac 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -6,6 +6,12 @@
v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_add3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_add3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -47,6 +53,10 @@ v_add_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x69,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s105, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x69,0x00,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x6a,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -67,6 +77,10 @@ v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x68,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s[104:105], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x68,0x00,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x6a,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -81,6 +95,12 @@ v_add_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_add_lshl_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_add_lshl_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -144,6 +164,12 @@ v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_alignbit_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_alignbit_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_alignbit_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -177,6 +203,12 @@ v_alignbit_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_alignbyte_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_alignbyte_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_alignbyte_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_alignbyte_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -219,6 +251,12 @@ v_and_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_and_or_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_and_or_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -273,6 +311,12 @@ v_bcnt_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_bfe_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_bfe_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -309,6 +353,12 @@ v_bfe_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_bfe_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_bfe_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -345,6 +395,12 @@ v_bfe_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_bfi_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_bfi_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -391,6 +447,14 @@ v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -423,12 +487,22 @@ v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
// W64: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubeid_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubeid_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -465,6 +539,12 @@ v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubema_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubema_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -501,6 +581,12 @@ v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubesc_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubesc_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -537,6 +623,12 @@ v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubetc_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubetc_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -687,6 +779,12 @@ v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cvt_pk_u8_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cvt_pk_u8_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -771,6 +869,12 @@ v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
v_div_fixup_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_div_fixup_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_div_fixup_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_div_fixup_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -807,6 +911,12 @@ v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0
v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -843,6 +953,12 @@ v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0
v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -891,6 +1007,9 @@ v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_lerp_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -927,6 +1046,12 @@ v_lerp_u8_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_lshl_add_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_lshl_add_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -963,6 +1088,12 @@ v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_lshl_or_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_lshl_or_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1017,6 +1148,12 @@ v_lshrrev_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mad_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1050,6 +1187,12 @@ v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mad_i32_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_i32_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_i32_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_i32_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1086,6 +1229,12 @@ v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_i32_i24_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_i32_i24_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1122,6 +1271,12 @@ v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_mad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1155,6 +1310,12 @@ v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mad_u32_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_u32_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_u32_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_u32_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1191,6 +1352,12 @@ v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_u32_u24_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_u32_u24_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1227,6 +1394,12 @@ v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1263,6 +1436,12 @@ v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,
v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1299,6 +1478,12 @@ v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,
v_max3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1332,6 +1517,12 @@ v_max3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1368,6 +1559,12 @@ v_max3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1401,6 +1598,12 @@ v_max3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1455,6 +1658,12 @@ v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1491,6 +1700,12 @@ v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1527,6 +1742,12 @@ v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1563,6 +1784,12 @@ v_maxmin_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1617,6 +1844,12 @@ v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1653,6 +1886,12 @@ v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,
v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1689,6 +1928,12 @@ v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,
v_med3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1722,6 +1967,12 @@ v_med3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1758,6 +2009,12 @@ v_med3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1791,6 +2048,12 @@ v_med3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1827,6 +2090,12 @@ v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1863,6 +2132,12 @@ v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,
v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1899,6 +2174,12 @@ v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,
v_min3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1932,6 +2213,12 @@ v_min3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1968,6 +2255,12 @@ v_min3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2001,6 +2294,12 @@ v_min3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2055,6 +2354,12 @@ v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2091,6 +2396,12 @@ v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2127,6 +2438,12 @@ v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2163,6 +2480,12 @@ v_minmax_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2199,6 +2522,9 @@ v_minmax_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_msad_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2244,6 +2570,12 @@ v_mul_lo_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mullit_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mullit_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2280,6 +2612,12 @@ v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_or3_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_or3_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2337,6 +2675,12 @@ v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_perm_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_perm_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2373,6 +2717,9 @@ v_perm_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_hi_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2409,6 +2756,12 @@ v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_sad_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2445,6 +2798,12 @@ v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_sad_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2481,6 +2840,9 @@ v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2518,6 +2880,10 @@ v_sub_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x06,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s6, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x06,0x01,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x69,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2538,6 +2904,10 @@ v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x0c,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s[12:13], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x0c,0x01,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x68,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2584,6 +2954,10 @@ v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s6, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x69,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2608,6 +2982,10 @@ v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x68,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s[104:105], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x68,0x02,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x6a,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2622,6 +3000,12 @@ v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_xad_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_xad_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2658,6 +3042,12 @@ v_xad_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_xor3_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_xor3_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2983,7 +3373,7 @@ v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4]
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
// GFX12: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
@@ -3004,7 +3394,7 @@ v_dot2_bf16_bf16_e64_dpp v0, s1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_bf16_bf16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4]
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
// GFX12: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
@@ -3066,6 +3456,12 @@ v_maximum_f16 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
v_minimum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimum3_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimum3_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3102,6 +3498,12 @@ v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,
v_maximum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximum3_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximum3_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3138,6 +3540,12 @@ v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,
v_minimum3_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimum3_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimum3_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimum3_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3174,6 +3582,12 @@ v_minimum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] f
v_maximum3_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximum3_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximum3_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximum3_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3210,6 +3624,12 @@ v_maximum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] f
v_maximumminimum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximumminimum_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximumminimum_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximumminimum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3246,6 +3666,12 @@ v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_minimummaximum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimummaximum_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimummaximum_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimummaximum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3282,6 +3708,12 @@ v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_maximumminimum_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximumminimum_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximumminimum_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximumminimum_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3318,6 +3750,12 @@ v_maximumminimum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,
v_minimummaximum_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimummaximum_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimummaximum_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimummaximum_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
index ab88ec8..2b7830c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
@@ -128,6 +128,12 @@ v_add_f16_e64_dpp v5, v1, v2 row_shl:1
v_add_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_add_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -170,6 +176,12 @@ v_add_f32_e64_dpp v5, v1, v2 row_shl:1
v_add_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_add_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -212,6 +224,12 @@ v_add_nc_u32_e64_dpp v5, v1, v2 row_shl:1
v_add_nc_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_nc_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_nc_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_add_nc_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -254,6 +272,12 @@ v_and_b32_e64_dpp v5, v1, v2 row_shl:1
v_and_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_and_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_and_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_and_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -296,6 +320,12 @@ v_ashrrev_i32_e64_dpp v5, v1, v2 row_shl:1
v_ashrrev_i32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_ashrrev_i32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_ashrrev_i32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_ashrrev_i32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -445,6 +475,12 @@ v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 row_shl:1
v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -487,6 +523,12 @@ v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 row_shl:1
v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -529,6 +571,12 @@ v_ldexp_f16_e64_dpp v5, v1, v2 row_shl:1
v_ldexp_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_ldexp_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_ldexp_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_ldexp_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -571,6 +619,12 @@ v_lshlrev_b32_e64_dpp v5, v1, v2 row_shl:1
v_lshlrev_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_lshlrev_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_lshlrev_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_lshlrev_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -613,6 +667,12 @@ v_lshrrev_b32_e64_dpp v5, v1, v2 row_shl:1
v_lshrrev_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_lshrrev_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_lshrrev_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_lshrrev_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -655,6 +715,12 @@ v_max_num_f16_e64_dpp v5, v1, v2 row_shl:1
v_max_num_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_num_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_num_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_num_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -697,6 +763,12 @@ v_max_num_f32_e64_dpp v5, v1, v2 row_shl:1
v_max_num_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_num_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_num_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_num_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -739,6 +811,12 @@ v_max_i32_e64_dpp v5, v1, v2 row_shl:1
v_max_i32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_i32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_i32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_i32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -781,6 +859,12 @@ v_max_u32_e64_dpp v5, v1, v2 row_shl:1
v_max_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -823,6 +907,12 @@ v_min_num_f16_e64_dpp v5, v1, v2 row_shl:1
v_min_num_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_num_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_num_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_num_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -865,6 +955,12 @@ v_min_num_f32_e64_dpp v5, v1, v2 row_shl:1
v_min_num_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_num_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_num_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_num_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -907,6 +1003,12 @@ v_min_i32_e64_dpp v5, v1, v2 row_shl:1
v_min_i32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_i32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_i32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_i32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -949,6 +1051,12 @@ v_min_u32_e64_dpp v5, v1, v2 row_shl:1
v_min_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -991,6 +1099,12 @@ v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 row_shl:1
v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_dx9_zero_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_dx9_zero_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1033,6 +1147,12 @@ v_mul_f16_e64_dpp v5, v1, v2 row_shl:1
v_mul_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1075,6 +1195,12 @@ v_mul_f32_e64_dpp v5, v1, v2 row_shl:1
v_mul_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1117,6 +1243,12 @@ v_mul_hi_i32_i24_e64_dpp v5, v1, v2 row_shl:1
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_hi_i32_i24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_hi_i32_i24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1159,6 +1291,12 @@ v_mul_hi_u32_u24_e64_dpp v5, v1, v2 row_shl:1
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_hi_u32_u24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_hi_u32_u24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1201,6 +1339,12 @@ v_mul_i32_i24_e64_dpp v5, v1, v2 row_shl:1
v_mul_i32_i24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_i32_i24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_i32_i24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_i32_i24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1243,6 +1387,12 @@ v_mul_legacy_f32_e64_dpp v5, v1, v2 row_shl:1
v_mul_legacy_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_legacy_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_legacy_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_legacy_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1285,6 +1435,12 @@ v_mul_u32_u24_e64_dpp v5, v1, v2 row_shl:1
v_mul_u32_u24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_u32_u24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_u32_u24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_u32_u24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1327,6 +1483,12 @@ v_or_b32_e64_dpp v5, v1, v2 row_shl:1
v_or_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_or_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_or_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_or_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1476,6 +1638,12 @@ v_sub_f16_e64_dpp v5, v1, v2 row_shl:1
v_sub_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_sub_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1518,6 +1686,12 @@ v_sub_f32_e64_dpp v5, v1, v2 row_shl:1
v_sub_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_sub_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1560,6 +1734,12 @@ v_sub_nc_u32_e64_dpp v5, v1, v2 row_shl:1
v_sub_nc_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_nc_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_nc_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_sub_nc_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1709,6 +1889,12 @@ v_subrev_f16_e64_dpp v5, v1, v2 row_shl:1
v_subrev_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_subrev_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_subrev_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_subrev_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1751,6 +1937,12 @@ v_subrev_f32_e64_dpp v5, v1, v2 row_shl:1
v_subrev_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_subrev_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_subrev_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_subrev_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1793,6 +1985,12 @@ v_subrev_nc_u32_e64_dpp v5, v1, v2 row_shl:1
v_subrev_nc_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_subrev_nc_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_subrev_nc_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_subrev_nc_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1835,6 +2033,12 @@ v_xnor_b32_e64_dpp v5, v1, v2 row_shl:1
v_xnor_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_xnor_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_xnor_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_xnor_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1877,6 +2081,12 @@ v_xor_b32_e64_dpp v5, v1, v2 row_shl:1
v_xor_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_xor_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_xor_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_xor_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
index dc151d66..b18029d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
@@ -45,6 +45,12 @@ v_add_co_ci_u32_e64_dpp v255, null, v255, v255, null clamp dpp8:[0,0,0,0,0,0,0,0
v_add_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_add_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x32,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -57,6 +63,12 @@ v_add_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x03,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_add_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x03,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -69,6 +81,12 @@ v_add_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_add_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_nc_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_add_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x25,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -78,6 +96,12 @@ v_add_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_and_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_and_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_and_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_and_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1b,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -87,6 +111,12 @@ v_and_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_ashrrev_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_ashrrev_i32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_ashrrev_i32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_ashrrev_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1a,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -97,14 +127,30 @@ v_cndmask_b32_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b32_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s105 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa4,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xac,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x01,0x01,0xd5,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -117,10 +163,22 @@ v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x18,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa0,0x01,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b32_e64_dpp v5, v1, 10, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x00,0x01,0xd5,0xe9,0x14,0xa1,0x01,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x01,0x01,0xd5,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -135,6 +193,12 @@ v_cndmask_b32_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cvt_pk_rtz_f16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
@@ -147,6 +211,12 @@ v_cvt_pk_rtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0]
v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cvt_pkrtz_f16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
@@ -159,9 +229,18 @@ v_cvt_pkrtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0]
v_ldexp_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_ldexp_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
v_ldexp_f16_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
+v_ldexp_f16_e64_dpp v5, v1, s2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_ldexp_f16_e64_dpp v5, v1, 2.0 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0xe8,0x01,0x08,0x01,0x77,0x39,0x05]
+
v_ldexp_f16_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x3b,0xd5,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
@@ -171,6 +250,12 @@ v_ldexp_f16_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshlrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_lshlrev_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_lshlrev_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_lshlrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x18,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -180,6 +265,12 @@ v_lshlrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshrrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x19,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_lshrrev_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x19,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_lshrrev_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x19,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_lshrrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x19,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -189,6 +280,12 @@ v_lshrrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_num_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_num_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x31,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -201,6 +298,12 @@ v_max_num_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_max_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_num_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_num_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x16,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -213,6 +316,12 @@ v_max_num_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_max_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_i32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_i32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x12,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -222,6 +331,12 @@ v_max_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x14,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x14,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x14,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x14,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -231,6 +346,12 @@ v_max_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_num_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_num_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x30,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -243,6 +364,12 @@ v_min_num_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_min_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_num_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_num_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x15,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -255,6 +382,12 @@ v_min_num_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_min_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_i32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_i32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x11,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -264,6 +397,12 @@ v_min_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x13,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -273,6 +412,12 @@ v_min_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_dx9_zero_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_dx9_zero_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_dx9_zero_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -285,6 +430,12 @@ v_mul_dx9_zero_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,
v_mul_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x35,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x35,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -297,6 +448,12 @@ v_mul_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x08,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x08,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x08,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x08,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -309,6 +466,12 @@ v_mul_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_hi_i32_i24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_hi_i32_i24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0a,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -318,6 +481,12 @@ v_mul_hi_i32_i24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_hi_u32_u24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_hi_u32_u24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0c,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -327,6 +496,12 @@ v_mul_hi_u32_u24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x09,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_i32_i24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x09,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_i32_i24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x09,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x09,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -336,6 +511,12 @@ v_mul_i32_i24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_legacy_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_legacy_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_legacy_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_legacy_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -348,6 +529,12 @@ v_mul_legacy_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,
v_mul_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_u32_u24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_u32_u24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0b,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -357,6 +544,12 @@ v_mul_u32_u24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_or_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_or_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1c,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_or_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1c,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_or_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1c,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -405,6 +598,12 @@ v_sub_co_ci_u32_e64_dpp v255, null, v255, v255, null clamp dpp8:[0,0,0,0,0,0,0,0
v_sub_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x33,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_sub_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x33,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -417,6 +616,12 @@ v_sub_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sub_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x04,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x04,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x04,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_sub_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x04,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -429,6 +634,12 @@ v_sub_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sub_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_sub_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x26,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -477,6 +688,12 @@ v_subrev_co_ci_u32_e64_dpp v255, null, v255, v255, null clamp dpp8:[0,0,0,0,0,0,
v_subrev_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_subrev_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_subrev_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x34,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_subrev_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x34,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -489,6 +706,12 @@ v_subrev_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] f
v_subrev_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x05,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_subrev_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x05,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_subrev_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x05,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_subrev_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x05,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -501,6 +724,12 @@ v_subrev_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] f
v_subrev_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x27,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_subrev_nc_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x27,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_subrev_nc_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x27,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_subrev_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x27,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -510,6 +739,12 @@ v_subrev_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xnor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_xnor_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_xnor_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_xnor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1e,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -519,6 +754,12 @@ v_xnor_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_xor_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_xor_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_xor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1d,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s
index b50b18e..037fa39 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s
@@ -7,6 +7,14 @@ v_cmp_class_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -59,6 +67,14 @@ v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -114,6 +130,14 @@ v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -166,6 +190,14 @@ v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -221,6 +253,14 @@ v_cmp_eq_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -273,6 +313,14 @@ v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -328,6 +376,14 @@ v_cmp_eq_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -380,6 +436,14 @@ v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -435,6 +499,14 @@ v_cmp_eq_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -487,6 +559,14 @@ v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -542,6 +622,14 @@ v_cmp_eq_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -594,6 +682,14 @@ v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -649,6 +745,14 @@ v_cmp_eq_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -701,6 +805,14 @@ v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -756,6 +868,14 @@ v_cmp_eq_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -808,6 +928,14 @@ v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -863,6 +991,14 @@ v_cmp_ge_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -915,6 +1051,14 @@ v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -970,6 +1114,14 @@ v_cmp_ge_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1022,6 +1174,14 @@ v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1077,6 +1237,14 @@ v_cmp_ge_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1129,6 +1297,14 @@ v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1184,6 +1360,14 @@ v_cmp_ge_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1236,6 +1420,14 @@ v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1291,6 +1483,14 @@ v_cmp_ge_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1343,6 +1543,14 @@ v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1398,6 +1606,14 @@ v_cmp_ge_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1450,6 +1666,14 @@ v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1505,6 +1729,14 @@ v_cmp_gt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1557,6 +1789,14 @@ v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1612,6 +1852,14 @@ v_cmp_gt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1664,6 +1912,14 @@ v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1719,6 +1975,14 @@ v_cmp_gt_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1771,6 +2035,14 @@ v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1826,6 +2098,14 @@ v_cmp_gt_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1878,6 +2158,14 @@ v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1933,6 +2221,14 @@ v_cmp_gt_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1985,6 +2281,14 @@ v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2040,6 +2344,14 @@ v_cmp_gt_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2092,6 +2404,14 @@ v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2147,6 +2467,14 @@ v_cmp_le_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2199,6 +2527,14 @@ v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2254,6 +2590,14 @@ v_cmp_le_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2306,6 +2650,14 @@ v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2361,6 +2713,14 @@ v_cmp_le_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2413,6 +2773,14 @@ v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2468,6 +2836,14 @@ v_cmp_le_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2520,6 +2896,14 @@ v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2575,6 +2959,14 @@ v_cmp_le_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2627,6 +3019,14 @@ v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2682,6 +3082,14 @@ v_cmp_le_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2734,6 +3142,14 @@ v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2789,6 +3205,14 @@ v_cmp_lg_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2841,6 +3265,14 @@ v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2896,6 +3328,14 @@ v_cmp_lg_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2948,6 +3388,14 @@ v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3003,6 +3451,14 @@ v_cmp_lt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3055,6 +3511,14 @@ v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3110,6 +3574,14 @@ v_cmp_lt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3162,6 +3634,14 @@ v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3217,6 +3697,14 @@ v_cmp_lt_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3269,6 +3757,14 @@ v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3324,6 +3820,14 @@ v_cmp_lt_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3376,6 +3880,14 @@ v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3431,6 +3943,14 @@ v_cmp_lt_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3483,6 +4003,14 @@ v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3538,6 +4066,14 @@ v_cmp_lt_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3590,6 +4126,14 @@ v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3645,6 +4189,14 @@ v_cmp_ne_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3697,6 +4249,14 @@ v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3752,6 +4312,14 @@ v_cmp_ne_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3804,6 +4372,14 @@ v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3859,6 +4435,14 @@ v_cmp_ne_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3911,6 +4495,14 @@ v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3966,6 +4558,14 @@ v_cmp_ne_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4018,6 +4618,14 @@ v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4073,6 +4681,14 @@ v_cmp_neq_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4125,6 +4741,14 @@ v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4180,6 +4804,14 @@ v_cmp_neq_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4232,6 +4864,14 @@ v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4287,6 +4927,14 @@ v_cmp_nge_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4339,6 +4987,14 @@ v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4394,6 +5050,14 @@ v_cmp_nge_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4446,6 +5110,14 @@ v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4501,6 +5173,14 @@ v_cmp_ngt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4553,6 +5233,14 @@ v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4608,6 +5296,14 @@ v_cmp_ngt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4660,6 +5356,14 @@ v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4715,6 +5419,14 @@ v_cmp_nle_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4767,6 +5479,14 @@ v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4822,6 +5542,14 @@ v_cmp_nle_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4874,6 +5602,14 @@ v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4929,6 +5665,14 @@ v_cmp_nlg_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4981,6 +5725,14 @@ v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5036,6 +5788,14 @@ v_cmp_nlg_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5088,6 +5848,14 @@ v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5143,6 +5911,14 @@ v_cmp_nlt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5195,6 +5971,14 @@ v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5250,6 +6034,14 @@ v_cmp_nlt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5302,6 +6094,14 @@ v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5357,6 +6157,14 @@ v_cmp_o_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5409,6 +6217,14 @@ v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5464,6 +6280,14 @@ v_cmp_o_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5516,6 +6340,14 @@ v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5571,6 +6403,14 @@ v_cmp_u_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5623,6 +6463,14 @@ v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5678,6 +6526,14 @@ v_cmp_u_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5730,6 +6586,14 @@ v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s
index b9dc614..c5ba45e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s
@@ -7,6 +7,14 @@ v_cmp_class_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -27,6 +35,14 @@ v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -46,6 +62,14 @@ v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -66,6 +90,14 @@ v_cmp_class_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -85,6 +117,14 @@ v_cmp_eq_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -105,6 +145,14 @@ v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -124,6 +172,14 @@ v_cmp_eq_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -144,6 +200,14 @@ v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -163,6 +227,14 @@ v_cmp_eq_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -183,6 +255,14 @@ v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -202,6 +282,14 @@ v_cmp_eq_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -222,6 +310,14 @@ v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -241,6 +337,14 @@ v_cmp_eq_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -261,6 +365,14 @@ v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -280,6 +392,14 @@ v_cmp_eq_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -300,6 +420,14 @@ v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -319,6 +447,14 @@ v_cmp_ge_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -339,6 +475,14 @@ v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -358,6 +502,14 @@ v_cmp_ge_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -378,6 +530,14 @@ v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -397,6 +557,14 @@ v_cmp_ge_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -417,6 +585,14 @@ v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -436,6 +612,14 @@ v_cmp_ge_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -456,6 +640,14 @@ v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -475,6 +667,14 @@ v_cmp_ge_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -495,6 +695,14 @@ v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -514,6 +722,14 @@ v_cmp_ge_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -534,6 +750,14 @@ v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -553,6 +777,14 @@ v_cmp_gt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -573,6 +805,14 @@ v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -592,6 +832,14 @@ v_cmp_gt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -612,6 +860,14 @@ v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -631,6 +887,14 @@ v_cmp_gt_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -651,6 +915,14 @@ v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -670,6 +942,14 @@ v_cmp_gt_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -690,6 +970,14 @@ v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -709,6 +997,14 @@ v_cmp_gt_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -729,6 +1025,14 @@ v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -748,6 +1052,14 @@ v_cmp_gt_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -768,6 +1080,14 @@ v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -787,6 +1107,14 @@ v_cmp_le_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -807,6 +1135,14 @@ v_cmp_le_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -826,6 +1162,14 @@ v_cmp_le_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -846,6 +1190,14 @@ v_cmp_le_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -865,6 +1217,14 @@ v_cmp_le_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -885,6 +1245,14 @@ v_cmp_le_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -904,6 +1272,14 @@ v_cmp_le_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -924,6 +1300,14 @@ v_cmp_le_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -943,6 +1327,14 @@ v_cmp_le_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -963,6 +1355,14 @@ v_cmp_le_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -982,6 +1382,14 @@ v_cmp_le_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1002,6 +1410,14 @@ v_cmp_le_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1021,6 +1437,14 @@ v_cmp_lg_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1041,6 +1465,14 @@ v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1060,6 +1492,14 @@ v_cmp_lg_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1080,6 +1520,14 @@ v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1099,6 +1547,14 @@ v_cmp_lt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1119,6 +1575,14 @@ v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1138,6 +1602,14 @@ v_cmp_lt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1158,6 +1630,14 @@ v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1177,6 +1657,14 @@ v_cmp_lt_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1197,6 +1685,14 @@ v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1216,6 +1712,14 @@ v_cmp_lt_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1236,6 +1740,14 @@ v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1255,6 +1767,14 @@ v_cmp_lt_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1275,6 +1795,14 @@ v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1294,6 +1822,14 @@ v_cmp_lt_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1314,6 +1850,14 @@ v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1333,6 +1877,14 @@ v_cmp_ne_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1353,6 +1905,14 @@ v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1372,6 +1932,14 @@ v_cmp_ne_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1392,6 +1960,14 @@ v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1411,6 +1987,14 @@ v_cmp_ne_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1431,6 +2015,14 @@ v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1450,6 +2042,14 @@ v_cmp_ne_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1470,6 +2070,14 @@ v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1489,6 +2097,14 @@ v_cmp_neq_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1509,6 +2125,14 @@ v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1528,6 +2152,14 @@ v_cmp_neq_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1548,6 +2180,14 @@ v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1567,6 +2207,14 @@ v_cmp_nge_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1587,6 +2235,14 @@ v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1606,6 +2262,14 @@ v_cmp_nge_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1626,6 +2290,14 @@ v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1645,6 +2317,14 @@ v_cmp_ngt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1665,6 +2345,14 @@ v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1684,6 +2372,14 @@ v_cmp_ngt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1704,6 +2400,14 @@ v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1723,6 +2427,14 @@ v_cmp_nle_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1743,6 +2455,14 @@ v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1762,6 +2482,14 @@ v_cmp_nle_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1782,6 +2510,14 @@ v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1801,6 +2537,14 @@ v_cmp_nlg_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1821,6 +2565,14 @@ v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1840,6 +2592,14 @@ v_cmp_nlg_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1860,6 +2620,14 @@ v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1879,6 +2647,14 @@ v_cmp_nlt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1899,6 +2675,14 @@ v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1918,6 +2702,14 @@ v_cmp_nlt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1938,6 +2730,14 @@ v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1957,6 +2757,14 @@ v_cmp_o_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1977,6 +2785,14 @@ v_cmp_o_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1996,6 +2812,14 @@ v_cmp_o_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2016,6 +2840,14 @@ v_cmp_o_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2035,6 +2867,14 @@ v_cmp_u_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2055,6 +2895,14 @@ v_cmp_u_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2074,6 +2922,14 @@ v_cmp_u_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2094,6 +2950,14 @@ v_cmp_u_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
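The added `s2`, `10`, and `2.0` cases above differ from the baseline `v2` cases only in the 9-bit src1 operand code packed into the second dword of the VOP3 encoding (src0 stays 0xe9, the DPP8 marker; the dpp16 file below uses 0xfa instead). As a rough cross-check of the expected byte patterns, here is a minimal Python sketch that extracts and names the src1 code; the field layout and operand-code ranges are inferred as assumptions from the encodings in this test, not taken from LLVM's MC layer, and the helper names are illustrative:

# Decode src1 from the 12-byte VOP3 DPP8 encodings listed above.
# Assumed layout: bytes 4-7 form a little-endian dword holding
# src0[8:0], src1[17:9], src2[26:18].
def src1_code(enc):
    word = int.from_bytes(bytes(enc[4:8]), "little")
    return (word >> 9) & 0x1FF

def describe(code):
    if code <= 105:            # SGPR operand codes: s0..s105
        return "s%d" % code
    if 128 <= code <= 192:     # inline integer constants 0..64
        return str(code - 128)
    if code == 244:            # inline float constant 2.0
        return "2.0"
    if code >= 256:            # VGPR operand codes: v0..v255
        return "v%d" % (code - 256)
    return "code %d" % code

# The v_cmp_le_i32 and v_cmp_lg_f16 variants above decode as expected:
for enc, want in [
    ([0x05,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05], "v2"),
    ([0x05,0x00,0x43,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05], "s2"),
    ([0x05,0x00,0x43,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05], "10"),
    ([0x05,0x00,0x05,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05], "2.0"),
]:
    assert describe(src1_code(enc)) == want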
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s
index 03958ba..eae2d5b2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s
@@ -4,6 +4,12 @@
v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_class_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_class_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -46,6 +52,12 @@ v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 b
v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_class_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_class_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -88,6 +100,12 @@ v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 b
v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -130,6 +148,12 @@ v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -172,6 +196,12 @@ v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -214,6 +244,12 @@ v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -256,6 +292,12 @@ v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -298,6 +340,12 @@ v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -340,6 +388,12 @@ v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -382,6 +436,12 @@ v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -424,6 +484,12 @@ v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -466,6 +532,12 @@ v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -508,6 +580,12 @@ v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -550,6 +628,12 @@ v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -592,6 +676,12 @@ v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -634,6 +724,12 @@ v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -676,6 +772,12 @@ v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -718,6 +820,12 @@ v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -760,6 +868,12 @@ v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -802,6 +916,12 @@ v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -844,6 +964,12 @@ v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -886,6 +1012,12 @@ v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -928,6 +1060,12 @@ v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -970,6 +1108,12 @@ v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1012,6 +1156,12 @@ v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1054,6 +1204,12 @@ v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1096,6 +1252,12 @@ v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1138,6 +1300,12 @@ v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lg_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1180,6 +1348,12 @@ v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1222,6 +1396,12 @@ v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1264,6 +1444,12 @@ v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1306,6 +1492,12 @@ v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1348,6 +1540,12 @@ v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1390,6 +1588,12 @@ v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1432,6 +1636,12 @@ v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1474,6 +1684,12 @@ v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1516,6 +1732,12 @@ v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1558,6 +1780,12 @@ v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1600,6 +1828,12 @@ v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_neq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1642,6 +1876,12 @@ v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_neq_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1684,6 +1924,12 @@ v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1726,6 +1972,12 @@ v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nge_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1768,6 +2020,12 @@ v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1810,6 +2068,12 @@ v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1852,6 +2116,12 @@ v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1894,6 +2164,12 @@ v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1936,6 +2212,12 @@ v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1978,6 +2260,12 @@ v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2020,6 +2308,12 @@ v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2062,6 +2356,12 @@ v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2104,6 +2404,12 @@ v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2146,6 +2452,12 @@ v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:
v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2188,6 +2500,12 @@ v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:
v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2230,6 +2548,12 @@ v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:
v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s
index efc6168..d63ca0c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s
@@ -7,6 +7,12 @@ v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_class_f16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_class_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xea,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x01,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
@@ -16,6 +22,12 @@ v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xfe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_class_f32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_class_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xea,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x01,0xfe,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
@@ -28,6 +40,12 @@ v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x82,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -40,6 +58,12 @@ v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x92,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x92,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x92,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x92,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -49,6 +73,12 @@ v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -58,6 +88,12 @@ v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -67,6 +103,12 @@ v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xba,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -76,6 +118,12 @@ v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xca,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xca,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xca,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xca,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -88,6 +136,12 @@ v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x86,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -100,6 +154,12 @@ v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x96,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x96,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x96,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x96,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -109,6 +169,12 @@ v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -118,6 +184,12 @@ v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -127,6 +199,12 @@ v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -136,6 +214,12 @@ v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xce,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xce,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xce,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xce,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -148,6 +232,12 @@ v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x84,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -160,6 +250,12 @@ v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x94,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x94,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x94,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x94,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -169,6 +265,12 @@ v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -178,6 +280,12 @@ v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -187,6 +295,12 @@ v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -196,6 +310,12 @@ v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xcc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xcc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -208,6 +328,12 @@ v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x83,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -220,6 +346,12 @@ v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x93,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_le_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x93,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x93,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x93,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -229,6 +361,12 @@ v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -238,6 +376,12 @@ v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -247,6 +391,12 @@ v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -256,6 +406,12 @@ v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xcb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xcb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -268,6 +424,12 @@ v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x85,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -280,6 +442,12 @@ v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x95,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x95,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x95,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x95,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -292,6 +460,12 @@ v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x81,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -304,6 +478,12 @@ v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x91,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x91,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x91,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x91,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -313,6 +493,12 @@ v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -322,6 +508,12 @@ v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -331,6 +523,12 @@ v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -340,6 +538,12 @@ v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -349,6 +553,12 @@ v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -358,6 +568,12 @@ v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -367,6 +583,12 @@ v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -376,6 +598,12 @@ v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xcd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xcd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -388,6 +616,12 @@ v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -400,6 +634,12 @@ v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -412,6 +652,12 @@ v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x89,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -424,6 +670,12 @@ v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x99,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x99,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x99,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x99,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -436,6 +688,12 @@ v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -448,6 +706,12 @@ v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -460,6 +724,12 @@ v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -472,6 +742,12 @@ v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -484,6 +760,12 @@ v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -496,6 +778,12 @@ v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -508,6 +796,12 @@ v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -520,6 +814,12 @@ v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -532,6 +832,12 @@ v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x87,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -544,6 +850,12 @@ v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x97,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_o_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x97,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x97,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x97,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -556,6 +868,12 @@ v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_u_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x88,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -568,5 +886,11 @@ v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_u_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x98,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_u_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x98,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x98,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x98,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/vop_dpp.s b/llvm/test/MC/AMDGPU/vop_dpp.s
index b2251f5..a15a48e 100644
--- a/llvm/test/MC/AMDGPU/vop_dpp.s
+++ b/llvm/test/MC/AMDGPU/vop_dpp.s
@@ -648,8 +648,8 @@ v_mov_b32 v0, s1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
v_and_b32 v0, s42, v1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
// NOSICI: :[[@LINE+3]]:{{[0-9]+}}: error: not a valid operand.
-// NOVI: :[[@LINE+2]]:{{[0-9]+}}: error: invalid operand for instruction
-// NOGFX9: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// NOVI: :[[@LINE+2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOGFX9: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
v_add_f32 v0, v1, s45 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
//===----------------------------------------------------------------------===//
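// Note on the diagnostic change above: s45 in src1 of a DPP instruction is
// still rejected on VI and GFX9, and the message is presumably reworded
// because such operands become valid on newer targets (see the GFX1150 and
// GFX12 tests below), so "operands are not valid for this GPU or mode" is
// the more accurate error. The check pattern, as used in this file:
//
//   // NOGFX9: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
//   v_add_f32 v0, v1, s45 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0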
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
index 6ab3e08..52426d3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
@@ -17,3 +17,12 @@
# GFX1150: v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05
+
+# GFX1150: v_add_f32_e64_dpp v5, v1, s2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff
+
+# GFX1150: v_min3_f16_e64_dpp v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff]
+0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff
+
+# GFX1150: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
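# A sketch of the added GFX1150 checks, assuming this file's existing RUN
# lines: the first two show that DPP encodings may now carry non-VGPR
# operands (an SGPR such as s2 in src1, an inline constant such as 2.0 in
# src2) and that the disassembler round-trips them; the third pins down the
# plain VOPC DPP decoding. In these encodings src0 holds 0xfa (the DPP16
# marker), with the real src0 register in the first trailing DPP byte:
#
#   v_add_f32_e64_dpp v5, v1, s2 row_mirror row_mask:0xf bank_mask:0xf
#   # encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]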
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
index 1be97b2..1d69134 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
@@ -22,3 +22,7 @@
# This check is stricter than the one in vinterp-fake16.txt and is GFX12-specific.
# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
0x00,0x00,0xe0,0xcd,0x01,0x05,0x0e,0x1c
+
+# Regression test: future fixes to VOPC _e64_dpp src1 handling must not break the decoding of this plain VOPC DPP encoding
+# GFX12: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
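# A byte-level reading of the guard above: the src0 byte 0xfa selects DPP16
# and the fourth byte 0x7c marks the VOPC e32 space, so these bytes must keep
# decoding as the plain (non-_e64) DPP form. The same check appears in
# gfx1150_dasm_features.txt. Reproduced for reference:
#
#   0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
#   # v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf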
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 4303c6d..0771e64 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -4,6 +4,12 @@
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_add3_u32_e64_dpp v5, v1, 15, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x1e,0x0d,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x1e,0x0d,0x04,0x01,0x1b,0x00,0xff
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -101,6 +107,9 @@
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_add_lshl_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x47,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -185,6 +194,9 @@
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_alignbit_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x16,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -227,6 +239,9 @@
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_alignbyte_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x17,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -311,6 +326,9 @@
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_and_or_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x57,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -437,6 +455,9 @@
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_bfe_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x11,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -479,6 +500,9 @@
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_bfe_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x10,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -521,6 +545,9 @@
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_bfi_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x12,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -660,6 +687,9 @@
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubeid_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -702,6 +732,9 @@
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubema_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -744,6 +777,9 @@
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubesc_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -786,6 +822,9 @@
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubetc_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1104,6 +1143,9 @@
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x26,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1230,6 +1272,9 @@
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_fma_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x13,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1314,6 +1359,9 @@
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_lerp_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x15,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1356,6 +1404,9 @@
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_lshl_add_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x46,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1398,6 +1449,9 @@
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_lshl_or_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x56,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1524,6 +1578,9 @@
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_i32_i24_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1566,6 +1623,9 @@
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_u32_u24_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1608,6 +1668,9 @@
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1650,6 +1713,9 @@
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1668,6 +1734,9 @@
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
+# GFX12: v_max3_i32_e64_dpp v5, v1, 15, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x1e,0xa9,0x01,0x01,0x11,0x01,0xff]
+0x05,0x00,0x1d,0xd6,0xfa,0x1e,0xa9,0x01,0x01,0x11,0x01,0xff
+
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff
@@ -1692,6 +1761,9 @@
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1818,6 +1890,12 @@
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
+# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1860,6 +1938,9 @@
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x69,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1902,6 +1983,9 @@
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x64,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1944,6 +2028,9 @@
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x62,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2070,6 +2157,9 @@
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x31,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2112,6 +2202,9 @@
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x20,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2154,6 +2247,9 @@
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x21,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2196,6 +2292,9 @@
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x29,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2238,6 +2337,9 @@
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2280,6 +2382,9 @@
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2406,6 +2511,9 @@
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2448,6 +2556,9 @@
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x68,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2490,6 +2601,9 @@
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x65,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2532,6 +2646,9 @@
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x63,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2574,6 +2691,9 @@
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_msad_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x39,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2658,6 +2778,9 @@
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mullit_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x18,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2700,6 +2823,9 @@
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_or3_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x58,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2784,6 +2910,9 @@
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_perm_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x44,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2826,6 +2955,9 @@
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_hi_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x23,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2868,6 +3000,9 @@
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x24,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2910,6 +3045,9 @@
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x25,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2952,6 +3090,9 @@
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x22,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3146,6 +3287,9 @@
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_xad_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x45,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3188,6 +3332,9 @@
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_xor3_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x40,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3440,6 +3587,9 @@
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_div_fixup_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x54,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x54,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3482,6 +3632,9 @@
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3524,6 +3677,9 @@
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x53,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3566,6 +3722,9 @@
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_i32_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x5a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x5a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3608,6 +3767,9 @@
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x41,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x41,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3650,6 +3812,9 @@
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_u32_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x59,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x59,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3692,6 +3857,9 @@
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3734,6 +3902,9 @@
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3776,6 +3947,9 @@
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3818,6 +3992,9 @@
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x32,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x32,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3860,6 +4037,9 @@
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x50,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x50,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3902,6 +4082,9 @@
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x51,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3944,6 +4127,9 @@
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3986,6 +4172,9 @@
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -4028,6 +4217,9 @@
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -4417,6 +4609,9 @@
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximum3_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4459,6 +4654,9 @@
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimum3_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4501,6 +4699,9 @@
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximum3_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x30,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4543,6 +4744,9 @@
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimum3_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4585,6 +4789,9 @@
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4627,6 +4834,9 @@
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4669,6 +4879,9 @@
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4711,6 +4924,9 @@
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
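# A byte-level note on the added checks in this file: relative to the
# all-VGPR form, only the packed 9-bit src1 operand changes. For
# v_add3_u32_e64_dpp, src1 = v2 encodes operand 0x102 and src1 = s3 encodes
# operand 0x003, which surfaces as bytes 0x04,0x0e becoming 0x06,0x0c:
#
#   v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
#   # encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
#   v_add3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
#   # encoding: [0x05,0x00,0x55,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]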
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index c73ffe7..a836ada 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -4,6 +4,12 @@
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_add3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -59,6 +65,9 @@
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x47,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_add_lshl_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x47,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x47,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -101,6 +110,9 @@
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_alignbit_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x16,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -134,6 +146,9 @@
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_alignbyte_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x17,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -173,6 +188,9 @@
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x57,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_and_or_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x57,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x57,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -221,6 +239,9 @@
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_bfe_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x11,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -257,6 +278,9 @@
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_bfe_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x10,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -293,6 +317,9 @@
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_bfi_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x12,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -354,6 +381,9 @@
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubeid_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -390,6 +420,9 @@
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubema_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -426,6 +459,9 @@
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubesc_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -462,6 +498,9 @@
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubetc_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -582,6 +621,9 @@
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x26,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -642,6 +684,9 @@
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_fma_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x13,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -690,6 +735,9 @@
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_lerp_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x15,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -726,6 +774,9 @@
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x46,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_lshl_add_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x46,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x46,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -762,6 +813,9 @@
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x56,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_lshl_or_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x56,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x56,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -810,6 +864,9 @@
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_i32_i24_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -846,6 +903,9 @@
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_u32_u24_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -882,6 +942,9 @@
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -918,6 +981,9 @@
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -954,6 +1020,9 @@
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1002,6 +1071,9 @@
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1038,6 +1110,9 @@
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x69,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1074,6 +1149,9 @@
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x64,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1110,6 +1188,9 @@
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x62,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1158,6 +1239,9 @@
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x31,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1194,6 +1278,9 @@
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x20,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1230,6 +1317,9 @@
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x21,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1266,6 +1356,9 @@
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x29,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1302,6 +1395,9 @@
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1338,6 +1434,9 @@
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1386,6 +1485,9 @@
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1422,6 +1524,9 @@
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x68,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1458,6 +1563,9 @@
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x65,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1494,6 +1602,9 @@
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x63,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1530,6 +1641,9 @@
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_msad_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x39,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1572,6 +1686,9 @@
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mullit_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x18,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1608,6 +1725,9 @@
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x58,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_or3_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x58,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x58,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1650,6 +1770,9 @@
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x44,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_perm_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x44,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x44,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1686,6 +1809,9 @@
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_hi_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x23,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1722,6 +1848,9 @@
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x24,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1758,6 +1887,9 @@
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x25,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1794,6 +1926,9 @@
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x22,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1874,6 +2009,9 @@
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x45,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_xad_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x45,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x45,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1910,6 +2048,9 @@
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_xor3_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x40,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2006,6 +2147,9 @@
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x54,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_div_fixup_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x54,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x54,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2048,6 +2192,12 @@
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
+# GFX12: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2090,6 +2240,9 @@
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x53,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x53,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2129,6 +2282,9 @@
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_i32_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x5a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x5a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2171,6 +2327,9 @@
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x41,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x41,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x41,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x41,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x41,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x41,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2210,6 +2369,9 @@
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x59,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_u32_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x59,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x59,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2252,6 +2414,9 @@
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2294,6 +2459,9 @@
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2333,6 +2501,9 @@
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2372,6 +2543,9 @@
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x32,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x32,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x32,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2414,6 +2588,9 @@
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x50,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x50,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x50,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x50,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x50,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x50,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2453,6 +2630,9 @@
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x51,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x51,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x51,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2492,6 +2672,9 @@
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2534,6 +2717,9 @@
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2573,6 +2759,9 @@
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2752,6 +2941,9 @@
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximum3_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x2e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2788,6 +2980,9 @@
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimum3_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x2d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2824,6 +3019,9 @@
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximum3_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x30,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x30,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2860,6 +3058,9 @@
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimum3_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x2f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2896,6 +3097,9 @@
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2932,6 +3136,9 @@
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2968,6 +3175,9 @@
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -3004,6 +3214,9 @@
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt
index 56d7805b..b10b8da 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt
@@ -59,6 +59,9 @@
# GFX12: v_add_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_add_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x32,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -101,6 +104,9 @@
# GFX12: v_add_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_add_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x03,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -143,6 +149,9 @@
# GFX12: v_add_nc_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_add_nc_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x25,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_nc_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -185,6 +194,9 @@
# GFX12: v_and_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_and_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_and_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -227,6 +239,9 @@
# GFX12: v_ashrrev_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_ashrrev_i32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_ashrrev_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -270,6 +285,10 @@
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff
+# W32: v_cndmask_b32_e64_dpp v5, v1, s3, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x06,0x18,0x00,0x01,0x1b,0x00,0xff]
+# W64: v_cndmask_b32_e64_dpp v5, v1, s3, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x06,0x18,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x01,0xd5,0xfa,0x06,0x18,0x00,0x01,0x1b,0x00,0xff
+
# W32: v_cndmask_b32_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff
@@ -324,6 +343,9 @@
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2f,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -366,6 +388,9 @@
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x3b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -390,6 +415,9 @@
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, 2.0 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x21,0x01,0xff]
+0x05,0x00,0x3b,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x21,0x01,0xff
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
@@ -408,6 +436,9 @@
# GFX12: v_lshlrev_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_lshlrev_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x18,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshlrev_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -450,6 +481,9 @@
# GFX12: v_lshrrev_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_lshrrev_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x19,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x19,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshrrev_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -492,6 +526,9 @@
# GFX12: v_max_num_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_num_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x31,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_num_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -534,6 +571,9 @@
# GFX12: v_max_num_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_num_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x16,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_num_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -576,6 +616,9 @@
# GFX12: v_max_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_i32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x12,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -618,6 +661,9 @@
# GFX12: v_max_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x14,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -660,6 +706,9 @@
# GFX12: v_min_num_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_num_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x30,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_num_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -702,6 +751,9 @@
# GFX12: v_min_num_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_num_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x15,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_num_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -744,6 +796,9 @@
# GFX12: v_min_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_i32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x11,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -786,6 +841,9 @@
# GFX12: v_min_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x13,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -828,6 +886,9 @@
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x07,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x07,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -870,6 +931,9 @@
# GFX12: v_mul_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x35,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -912,6 +976,9 @@
# GFX12: v_mul_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x08,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -954,6 +1021,9 @@
# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -996,6 +1066,9 @@
# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1038,6 +1111,9 @@
# GFX12: v_mul_i32_i24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_i32_i24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x09,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_i32_i24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1080,6 +1156,9 @@
# GFX12: v_mul_u32_u24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_u32_u24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_u32_u24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1122,6 +1201,9 @@
# GFX12: v_or_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_or_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_or_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1219,6 +1301,9 @@
# GFX12: v_sub_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_sub_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x33,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_sub_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1261,6 +1346,9 @@
# GFX12: v_sub_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_sub_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x04,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_sub_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1303,6 +1391,9 @@
# GFX12: v_sub_nc_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_sub_nc_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x26,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_sub_nc_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1400,6 +1491,9 @@
# GFX12: v_subrev_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_subrev_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x34,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_subrev_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1442,6 +1536,9 @@
# GFX12: v_subrev_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_subrev_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x05,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_subrev_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1484,6 +1581,9 @@
# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x27,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x27,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1526,6 +1626,9 @@
# GFX12: v_xnor_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_xnor_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1e,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_xnor_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1568,6 +1671,9 @@
# GFX12: v_xor_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_xor_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1d,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_xor_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt
index da7faa8..f78106e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt
@@ -23,6 +23,9 @@
# GFX12: v_add_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_add_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x32,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_add_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x32,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x32,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -35,6 +38,9 @@
# GFX12: v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_add_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x03,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_add_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x03,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x03,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -47,18 +53,27 @@
# GFX12: v_add_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x25,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_add_nc_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x25,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_add_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x25,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x25,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_and_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_and_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_and_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_ashrrev_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_ashrrev_i32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_ashrrev_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
@@ -66,6 +81,10 @@
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x01,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05
+# W32: v_cndmask_b32_e64_dpp v5, v1, s3, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x06,0x18,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cndmask_b32_e64_dpp v5, v1, s3, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x06,0x18,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x01,0xd5,0xe9,0x06,0x18,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cndmask_b32_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05
@@ -84,6 +103,9 @@
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2f,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -96,30 +118,48 @@
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x08,0x01,0x77,0x39,0x05]
+0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x08,0x01,0x77,0x39,0x05
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x10,0x01,0x77,0x39,0x05]
+0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x10,0x01,0x77,0x39,0x05
+
# GFX12: v_ldexp_f16_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x3b,0xd5,0xea,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
0xff,0x81,0x3b,0xd5,0xea,0xfe,0x03,0x38,0xff,0x00,0x00,0x00
# GFX12: v_lshlrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x18,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_lshlrev_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x18,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_lshlrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x18,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x18,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_lshrrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x19,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x19,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_lshrrev_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x19,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x19,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_lshrrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x19,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x19,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_max_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_num_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x31,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x31,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x31,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -132,6 +172,9 @@
# GFX12: v_max_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x16,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_num_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x16,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x16,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x16,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -144,18 +187,27 @@
# GFX12: v_max_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x12,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_i32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x12,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x12,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x12,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_max_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x14,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x14,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x14,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x14,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x14,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x14,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_min_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_num_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x30,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x30,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x30,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -168,6 +220,9 @@
# GFX12: v_min_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x15,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_num_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x15,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x15,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x15,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -180,18 +235,27 @@
# GFX12: v_min_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x11,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_i32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x11,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x11,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x11,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_min_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x13,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x13,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x13,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x13,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x07,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x07,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -204,6 +268,9 @@
# GFX12: v_mul_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x35,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x35,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x35,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -216,6 +283,9 @@
# GFX12: v_mul_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x08,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x08,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x08,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x08,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x08,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x08,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -228,30 +298,45 @@
# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_hi_i32_i24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x0a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_hi_u32_u24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x0c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x09,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x09,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_i32_i24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x09,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x09,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_i32_i24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x09,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x09,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_u32_u24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_u32_u24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x0b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x0b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_or_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_or_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_or_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
@@ -277,6 +362,9 @@
# GFX12: v_sub_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_sub_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x33,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_sub_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x33,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x33,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -289,6 +377,9 @@
# GFX12: v_sub_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x04,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_sub_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x04,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_sub_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x04,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x04,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -301,6 +392,9 @@
# GFX12: v_sub_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x26,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_sub_nc_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x26,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_sub_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x26,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x26,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
@@ -326,6 +420,9 @@
# GFX12: v_subrev_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_subrev_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x34,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_subrev_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x34,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x34,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -338,6 +435,9 @@
# GFX12: v_subrev_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x05,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x05,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_subrev_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x05,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x05,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_subrev_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x05,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x05,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -350,17 +450,26 @@
# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x27,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x27,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x27,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x27,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_subrev_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x27,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x27,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_xnor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1e,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_xnor_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1e,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_xnor_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1e,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1e,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_xor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1d,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_xor_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1d,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_xor_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1d,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt
index e6ea6da..13e34ca 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt
@@ -21,6 +21,10 @@
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_class_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -76,6 +80,10 @@
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_class_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -131,6 +139,10 @@
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -186,6 +198,10 @@
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -241,6 +257,10 @@
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -296,6 +316,10 @@
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -351,6 +375,10 @@
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -406,6 +434,10 @@
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -461,6 +493,10 @@
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -516,6 +552,10 @@
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -571,6 +611,10 @@
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -606,6 +650,9 @@
# GFX12: v_cmp_ge_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_ge_i16_e64_dpp null, v255, 10 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30]
+0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30
+
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
@@ -626,6 +673,10 @@
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -681,6 +732,10 @@
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -736,6 +791,10 @@
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -791,6 +850,10 @@
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -846,6 +909,10 @@
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -901,6 +968,10 @@
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -956,6 +1027,10 @@
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1011,6 +1086,10 @@
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1066,6 +1145,10 @@
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1121,6 +1204,10 @@
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1176,6 +1263,10 @@
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1231,6 +1322,10 @@
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1286,6 +1381,10 @@
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1341,6 +1440,10 @@
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1396,6 +1499,10 @@
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1451,6 +1558,10 @@
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lg_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1506,6 +1617,10 @@
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lg_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1561,6 +1676,10 @@
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1616,6 +1735,10 @@
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1671,6 +1794,10 @@
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1726,6 +1853,10 @@
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1781,6 +1912,10 @@
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1836,6 +1971,10 @@
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1891,6 +2030,10 @@
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1946,6 +2089,10 @@
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2001,6 +2148,10 @@
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2056,6 +2207,10 @@
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2111,6 +2266,10 @@
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_neq_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2166,6 +2325,10 @@
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_neq_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2221,6 +2384,10 @@
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nge_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2276,6 +2443,10 @@
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nge_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2331,6 +2502,10 @@
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ngt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2386,6 +2561,10 @@
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ngt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2441,6 +2620,10 @@
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nle_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2496,6 +2679,10 @@
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nle_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2551,6 +2738,10 @@
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlg_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2606,6 +2797,10 @@
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlg_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2661,6 +2856,10 @@
# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2716,6 +2915,10 @@
# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2771,6 +2974,10 @@
# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_o_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2826,6 +3033,10 @@
# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_o_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2850,6 +3061,10 @@
# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# W32: v_cmp_o_f32_e64_dpp s104, v1, 2.0 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff]
+# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, 2.0 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff]
+0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff
+
# W32: v_cmp_o_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_o_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
@@ -2881,6 +3096,10 @@
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_u_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2936,6 +3155,10 @@
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_u_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt
index 98f8fd9..f36857b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt
@@ -5,6 +5,14 @@
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_class_f16_e64_dpp s10, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# W32: v_cmp_class_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_class_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -24,6 +32,10 @@
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_class_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_class_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -43,6 +55,10 @@
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -62,6 +78,10 @@
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -81,6 +101,10 @@
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -100,6 +124,10 @@
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -119,6 +147,10 @@
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -138,6 +170,10 @@
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -157,6 +193,10 @@
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -176,6 +216,10 @@
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -184,6 +228,10 @@
# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05]
+0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
@@ -195,6 +243,10 @@
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -214,6 +266,10 @@
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -233,6 +289,10 @@
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -252,6 +312,14 @@
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
+# W32: v_cmp_ge_u32_e64_dpp s10, v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -271,6 +339,10 @@
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -290,6 +362,10 @@
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -309,6 +385,10 @@
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -328,6 +408,10 @@
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -347,6 +431,10 @@
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -366,6 +454,10 @@
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -385,6 +477,10 @@
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -404,6 +500,10 @@
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -423,6 +523,10 @@
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -442,6 +546,10 @@
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -461,6 +569,10 @@
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -480,6 +592,10 @@
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -499,6 +615,10 @@
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lg_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -518,6 +638,10 @@
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lg_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -537,6 +661,10 @@
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -556,6 +684,10 @@
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -575,6 +707,10 @@
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -594,6 +730,10 @@
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -613,6 +753,10 @@
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -632,6 +776,10 @@
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -651,6 +799,10 @@
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -670,6 +822,10 @@
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -689,6 +845,10 @@
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -708,6 +868,10 @@
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -727,6 +891,10 @@
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_neq_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_neq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -746,6 +914,10 @@
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_neq_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_neq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -765,6 +937,10 @@
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nge_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -784,6 +960,10 @@
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nge_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -803,6 +983,10 @@
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ngt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ngt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -822,6 +1006,10 @@
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ngt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ngt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -841,6 +1029,10 @@
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nle_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nle_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -860,6 +1052,10 @@
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nle_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nle_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -879,6 +1075,10 @@
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlg_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -898,10 +1098,18 @@
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlg_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlg_f32_e64_dpp s104, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlg_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -917,6 +1125,10 @@
# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -936,6 +1148,10 @@
# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -955,6 +1171,10 @@
# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_o_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_o_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -974,6 +1194,10 @@
# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_o_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_o_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -993,6 +1217,10 @@
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_u_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_u_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -1012,6 +1240,10 @@
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_u_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_u_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index eb7675f..0f933f0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -31,6 +31,9 @@
# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -73,6 +76,9 @@
# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -115,6 +121,9 @@
# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -157,6 +166,9 @@
# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -199,6 +211,9 @@
# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -241,6 +256,9 @@
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -250,6 +268,9 @@
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, 10 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13]
+0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13
+
# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
@@ -283,6 +304,9 @@
# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -325,6 +349,9 @@
# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -367,6 +394,9 @@
# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -409,6 +439,9 @@
# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -451,6 +484,9 @@
# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -493,6 +529,9 @@
# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -535,6 +574,9 @@
# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -577,6 +619,9 @@
# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -619,6 +664,9 @@
# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -661,6 +709,9 @@
# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -703,6 +754,9 @@
# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -745,6 +799,9 @@
# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -787,6 +844,9 @@
# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -829,6 +889,9 @@
# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -871,6 +934,9 @@
# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -913,6 +979,9 @@
# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -955,6 +1024,9 @@
# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -997,6 +1069,9 @@
# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1039,6 +1114,9 @@
# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1081,6 +1159,9 @@
# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1123,6 +1204,9 @@
# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1165,6 +1249,9 @@
# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1207,6 +1294,9 @@
# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1249,6 +1339,9 @@
# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1291,6 +1384,9 @@
# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1333,6 +1429,9 @@
# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1375,6 +1474,9 @@
# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1417,6 +1519,9 @@
# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1459,6 +1564,9 @@
# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1501,6 +1609,9 @@
# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1543,6 +1654,9 @@
# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1585,6 +1699,9 @@
# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1627,6 +1744,9 @@
# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1669,6 +1789,9 @@
# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1711,6 +1834,9 @@
# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1753,6 +1879,9 @@
# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1795,6 +1924,9 @@
# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1837,6 +1969,9 @@
# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1879,6 +2014,9 @@
# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1921,6 +2059,9 @@
# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1963,6 +2104,9 @@
# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2005,6 +2149,9 @@
# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2047,6 +2194,9 @@
# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2089,6 +2239,9 @@
# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2131,6 +2284,9 @@
# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2173,6 +2329,9 @@
# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2215,6 +2374,9 @@
# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2257,6 +2419,12 @@
# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, 2.0 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff
+
+# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
index d5e112e..bf4f971 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
@@ -4,18 +4,27 @@
# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -28,6 +37,12 @@
# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -40,30 +55,48 @@
# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -76,6 +109,9 @@
# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -88,30 +124,45 @@
# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -124,6 +175,9 @@
# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -136,30 +190,45 @@
# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -172,6 +241,9 @@
# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -184,30 +256,45 @@
# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -220,6 +307,9 @@
# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -232,6 +322,9 @@
# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -244,6 +337,9 @@
# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -256,54 +352,84 @@
# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -316,6 +442,9 @@
# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -328,6 +457,12 @@
# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -340,6 +475,9 @@
# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -352,6 +490,9 @@
# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -364,6 +505,9 @@
# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -376,6 +520,9 @@
# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -388,6 +535,9 @@
# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -400,6 +550,9 @@
# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -412,6 +565,9 @@
# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -424,6 +580,9 @@
# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -436,6 +595,9 @@
# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -448,6 +610,9 @@
# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -460,6 +625,9 @@
# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -472,6 +640,9 @@
# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -484,6 +655,9 @@
# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
index 1a73178..d6f10e9 100644
--- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
@@ -116,14 +116,14 @@
0x10 0x08 0x02 0x46 # CHECK: sel.s $f0, $f1, $f2
0x35 0x10 0x64 0x00 # CHECK: seleqz $2, $3, $4
0x37 0x10 0x64 0x00 # CHECK: selnez $2, $3, $4
-0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
-0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4
0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4
+0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
+0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
+0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
+0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4
0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4
-0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
-0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
0x14 0x10 0x04 0x46 # CHECK: seleqz.s $f0, $f2, $f4
0x14 0x10 0x24 0x46 # CHECK: seleqz.d $f0, $f2, $f4
diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
index 53ea025..e1ba009 100644
--- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
@@ -92,8 +92,8 @@
0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4
0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4
0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4
0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4
0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
@@ -103,8 +103,8 @@
0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4
0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4
0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4
0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4
0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4
diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
index 9aeea45..a7dfbd2 100644
--- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
@@ -140,15 +140,15 @@
0x43 0x00 0x50 0xec # CHECK: lwupc $2, 268
0x98 0x18 0x24 0x46 # CHECK: maddf.d $f2, $f3, $f4
0x98 0x18 0x04 0x46 # CHECK: maddf.s $f2, $f3, $f4
-0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
-0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
+0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
+0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4
0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4
0x01 0x78 0x08 0x40 # CHECK: mfc0 $8, $15, 1
0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4
0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4
-0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
-0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
+0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
+0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
0xda 0x10 0x64 0x00 # CHECK: mod $2, $3, $4
0xdb 0x10 0x64 0x00 # CHECK: modu $2, $3, $4
0x25 0x78 0xe0 0x03 # CHECK: move $15, $ra
diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
index 32b91c6..0030e51 100644
--- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
@@ -111,8 +111,8 @@
0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4
0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4
0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4
0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4
0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
@@ -122,8 +122,8 @@
0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4
0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4
0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4
0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4
0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4
diff --git a/llvm/test/MC/Mips/mips32r6/valid.s b/llvm/test/MC/Mips/mips32r6/valid.s
index 0f098a1..0d705b6 100644
--- a/llvm/test/MC/Mips/mips32r6/valid.s
+++ b/llvm/test/MC/Mips/mips32r6/valid.s
@@ -170,14 +170,14 @@ a:
sel.s $f0,$f1,$f2 # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
seleqz $2,$3,$4 # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35]
selnez $2,$3,$4 # CHECK: selnez $2, $3, $4 # encoding: [0x00,0x64,0x10,0x37]
- max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
- max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
+ max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
+ max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c]
min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1c]
maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f]
maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f]
- mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
- mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
+ mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
+ mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
seleqz.s $f0, $f2, $f4 # CHECK: seleqz.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x14]
seleqz.d $f0, $f2, $f4 # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14]
diff --git a/llvm/test/MC/Mips/mips64r6/valid.s b/llvm/test/MC/Mips/mips64r6/valid.s
index c50bd9e..ff6e1d7 100644
--- a/llvm/test/MC/Mips/mips64r6/valid.s
+++ b/llvm/test/MC/Mips/mips64r6/valid.s
@@ -183,14 +183,14 @@ a:
lwupc $2,268 # CHECK: lwupc $2, 268 # encoding: [0xec,0x50,0x00,0x43]
maddf.d $f2,$f3,$f4 # CHECK: maddf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x98]
maddf.s $f2,$f3,$f4 # CHECK: maddf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x98]
- max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
- max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
+ max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
+ max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f]
maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f]
min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1c]
min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c]
- mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
- mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
+ mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
+ mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
mfc0 $8,$15,1 # CHECK: mfc0 $8, $15, 1 # encoding: [0x40,0x08,0x78,0x01]
mod $2,$3,$4 # CHECK: mod $2, $3, $4 # encoding: [0x00,0x64,0x10,0xda]
modu $2,$3,$4 # CHECK: modu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdb]
diff --git a/llvm/test/MachineVerifier/test_g_fcmp.mir b/llvm/test/MachineVerifier/test_g_fcmp.mir
index 9a73569..17be746 100644
--- a/llvm/test/MachineVerifier/test_g_fcmp.mir
+++ b/llvm/test/MachineVerifier/test_g_fcmp.mir
@@ -24,17 +24,22 @@ body: |
%4:_(<2 x s32>) = G_IMPLICIT_DEF
%5:_(s1) = G_FCMP floatpred(oeq), %3, %4
- ; mismatched element count
+ ; mismatched fixed element count
; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
%6:_(<2 x s32>) = G_IMPLICIT_DEF
%7:_(<2 x s32>) = G_IMPLICIT_DEF
%8:_(<4 x s1>) = G_FCMP floatpred(oeq), %6, %7
+ ; mismatched scalable element count
+ ; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
+ %9:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %10:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %11:_(<vscale x 4 x s1>) = G_FCMP floatpred(oeq), %9, %10
; mismatched scalar element type
; CHECK: *** Bad machine code: Type mismatch in generic instruction ***
- %9:_(s32) = G_FCONSTANT float 0.0
- %10:_(s64) = G_FCONSTANT float 1.0
- %11:_(s1) = G_FCMP floatpred(oeq), %9, %10
+ %12:_(s32) = G_FCONSTANT float 0.0
+ %13:_(s64) = G_FCONSTANT float 1.0
+ %14:_(s1) = G_FCMP floatpred(oeq), %12, %13
...
diff --git a/llvm/test/MachineVerifier/test_g_icmp.mir b/llvm/test/MachineVerifier/test_g_icmp.mir
index 7c64e25..74e3d34 100644
--- a/llvm/test/MachineVerifier/test_g_icmp.mir
+++ b/llvm/test/MachineVerifier/test_g_icmp.mir
@@ -24,17 +24,22 @@ body: |
%4:_(<2 x s32>) = G_IMPLICIT_DEF
%5:_(s1) = G_ICMP intpred(eq), %3, %4
- ; mismatched element count
+ ; mismatched fixed element count
; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
%6:_(<2 x s32>) = G_IMPLICIT_DEF
%7:_(<2 x s32>) = G_IMPLICIT_DEF
%8:_(<4 x s1>) = G_ICMP intpred(eq), %6, %7
+ ; mismatched scalable element count
+ ; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
+ %9:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %10:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %11:_(<vscale x 4 x s1>) = G_ICMP intpred(eq), %9, %10
; mismatched scalar element type
; CHECK: *** Bad machine code: Type mismatch in generic instruction ***
- %9:_(s32) = G_CONSTANT i32 0
- %10:_(s64) = G_CONSTANT i32 1
- %11:_(s1) = G_ICMP intpred(eq), %9, %10
+ %12:_(s32) = G_CONSTANT i32 0
+ %13:_(s64) = G_CONSTANT i32 1
+ %14:_(s1) = G_ICMP intpred(eq), %12, %13
...
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 4ab5567..493350d 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -984,10 +984,10 @@ static const X86FoldTableEntry Table1[] = {
{X86::RORX32ri_EVEX, X86::RORX32mi_EVEX, 0},
{X86::RORX64ri, X86::RORX64mi, 0},
{X86::RORX64ri_EVEX, X86::RORX64mi_EVEX, 0},
- {X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16},
- {X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16},
- {X86::ROUNDSDr, X86::ROUNDSDm, 0},
- {X86::ROUNDSSr, X86::ROUNDSSm, 0},
+ {X86::ROUNDPDri, X86::ROUNDPDmi, TB_ALIGN_16},
+ {X86::ROUNDPSri, X86::ROUNDPSmi, TB_ALIGN_16},
+ {X86::ROUNDSDri, X86::ROUNDSDmi, 0},
+ {X86::ROUNDSSri, X86::ROUNDSSmi, 0},
{X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16},
{X86::RSQRTSSr, X86::RSQRTSSm, 0},
{X86::SAR16r1_ND, X86::SAR16m1_ND, 0},
@@ -1791,10 +1791,10 @@ static const X86FoldTableEntry Table1[] = {
{X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0},
{X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0},
{X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0},
- {X86::VROUNDPDYr, X86::VROUNDPDYm, 0},
- {X86::VROUNDPDr, X86::VROUNDPDm, 0},
- {X86::VROUNDPSYr, X86::VROUNDPSYm, 0},
- {X86::VROUNDPSr, X86::VROUNDPSm, 0},
+ {X86::VROUNDPDYri, X86::VROUNDPDYmi, 0},
+ {X86::VROUNDPDri, X86::VROUNDPDmi, 0},
+ {X86::VROUNDPSYri, X86::VROUNDPSYmi, 0},
+ {X86::VROUNDPSri, X86::VROUNDPSmi, 0},
{X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0},
{X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0},
{X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0},
@@ -2234,8 +2234,8 @@ static const X86FoldTableEntry Table2[] = {
{X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16},
{X86::PXORrr, X86::PXORrm, TB_ALIGN_16},
{X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE},
- {X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE},
- {X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE},
+ {X86::ROUNDSDri_Int, X86::ROUNDSDmi_Int, TB_NO_REVERSE},
+ {X86::ROUNDSSri_Int, X86::ROUNDSSmi_Int, TB_NO_REVERSE},
{X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE},
{X86::SBB16rr, X86::SBB16rm, 0},
{X86::SBB16rr_ND, X86::SBB16rm_ND, 0},
@@ -3778,10 +3778,10 @@ static const X86FoldTableEntry Table2[] = {
{X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE},
{X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0},
{X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE},
- {X86::VROUNDSDr, X86::VROUNDSDm, 0},
- {X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE},
- {X86::VROUNDSSr, X86::VROUNDSSm, 0},
- {X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE},
+ {X86::VROUNDSDri, X86::VROUNDSDmi, 0},
+ {X86::VROUNDSDri_Int, X86::VROUNDSDmi_Int, TB_NO_REVERSE},
+ {X86::VROUNDSSri, X86::VROUNDSSmi, 0},
+ {X86::VROUNDSSri_Int, X86::VROUNDSSmi_Int, TB_NO_REVERSE},
{X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0},
{X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0},
{X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0},
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
index cc66cbe..ce1b591 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -102,9 +102,9 @@ if.end8:
define i32 @test4(i32 %c) nounwind {
; CHECK-LABEL: @test4(
; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 1, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB]]
-; CHECK-NEXT: i32 4, label [[SW_BB]]
+; CHECK-NEXT: i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 2, label [[SW_BB]]
+; CHECK-NEXT: i32 4, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: br i1 true, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
@@ -207,8 +207,8 @@ define i1 @test7(i32 %c) nounwind {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 6, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 7, label [[SW_BB]]
+; CHECK-NEXT: i32 6, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 7, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: ret i1 true
@@ -790,8 +790,8 @@ define i32 @test18(i8 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT: i8 -111, label [[DISPATCH]]
+; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT: i8 -111, label [[DISPATCH]]
; CHECK-NEXT: ]
; CHECK: target93:
; CHECK-NEXT: ret i32 93
@@ -817,8 +817,8 @@ define i8 @test19(i8 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT: i8 -111, label [[DISPATCH]]
+; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT: i8 -111, label [[DISPATCH]]
; CHECK-NEXT: ]
; CHECK: target93:
; CHECK-NEXT: ret i8 96
@@ -846,8 +846,8 @@ define i1 @test20(i64 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i64 0, label [[EXIT2:%.*]]
-; CHECK-NEXT: i64 -2147483647, label [[EXIT2]]
+; CHECK-NEXT: i64 0, label [[EXIT2:%.*]]
+; CHECK-NEXT: i64 -2147483647, label [[EXIT2]]
; CHECK-NEXT: ]
; CHECK: default:
; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[B]], 0
@@ -1123,6 +1123,70 @@ else:
ret i1 true
}
+define i1 @icmp_eq_range_attr(i8 range(i8 1, 0) %i) {
+; CHECK-LABEL: @icmp_eq_range_attr(
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_attr(i8 range(i8 -1, 1) %i) {
+; CHECK-LABEL: @neg_icmp_eq_range_attr(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I:%.*]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+declare range(i8 1, 0) i8 @returns_non_zero_range_helper()
+declare range(i8 -1, 1) i8 @returns_contain_zero_range_helper()
+
+define i1 @icmp_eq_range_return() {
+; CHECK-LABEL: @icmp_eq_range_return(
+; CHECK-NEXT: [[I:%.*]] = call i8 @returns_non_zero_range_helper()
+; CHECK-NEXT: ret i1 false
+;
+ %i = call i8 @returns_non_zero_range_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_return() {
+; CHECK-LABEL: @neg_icmp_eq_range_return(
+; CHECK-NEXT: [[I:%.*]] = call i8 @returns_contain_zero_range_helper()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i = call i8 @returns_contain_zero_range_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+declare i8 @returns_i8_helper()
+
+define i1 @icmp_eq_range_call() {
+; CHECK-LABEL: @icmp_eq_range_call(
+; CHECK-NEXT: [[I:%.*]] = call range(i8 1, 0) i8 @returns_i8_helper()
+; CHECK-NEXT: ret i1 false
+;
+ %i = call range(i8 1, 0) i8 @returns_i8_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_call() {
+; CHECK-LABEL: @neg_icmp_eq_range_call(
+; CHECK-NEXT: [[I:%.*]] = call range(i8 0, 11) i8 @returns_i8_helper()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i = call range(i8 0, 11) i8 @returns_i8_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
declare i16 @llvm.ctlz.i16(i16, i1)
declare i16 @llvm.cttz.i16(i16, i1)
declare i16 @llvm.ctpop.i16(i16)
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
index 75130c2..e058c5b 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
@@ -176,3 +176,80 @@ define i129 @fp128tosi129(fp128 %a) {
%conv = fptosi fp128 %a to i129
ret i129 %conv
}
+
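+; The <2 x float> test below checks that the vector case is scalarized:
+; each lane repeats the scalar expansion above (split the float into sign,
+; biased exponent and mantissa, saturate when the exponent is out of range,
+; otherwise shift the mantissa by exponent-150 and apply the sign), and the
+; lane results are rebuilt with insertelement.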
+define <2 x i129> @floattosi129v2(<2 x float> %a) {
+; CHECK-LABEL: @floattosi129v2(
+; CHECK-NEXT: fp-to-i-entryfp-to-i-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i129 [[TMP2]], 23
+; CHECK-NEXT: [[TMP6:%.*]] = and i129 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i129 [[TMP2]], 8388607
+; CHECK-NEXT: [[TMP8:%.*]] = or i129 [[TMP7]], 8388608
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i129 [[TMP6]], 127
+; CHECK-NEXT: br i1 [[TMP9]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_END2:%.*]]
+; CHECK: fp-to-i-if-end2:
+; CHECK-NEXT: [[TMP10:%.*]] = add i129 [[TMP6]], -256
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i129 [[TMP10]], -129
+; CHECK-NEXT: br i1 [[TMP11]], label [[FP_TO_I_IF_THEN53:%.*]], label [[FP_TO_I_IF_END94:%.*]]
+; CHECK: fp-to-i-if-then53:
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP3]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-end94:
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i129 [[TMP6]], 150
+; CHECK-NEXT: br i1 [[TMP13]], label [[FP_TO_I_IF_THEN125:%.*]], label [[FP_TO_I_IF_ELSE6:%.*]]
+; CHECK: fp-to-i-if-then125:
+; CHECK-NEXT: [[TMP14:%.*]] = sub i129 150, [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i129 [[TMP15]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-else6:
+; CHECK-NEXT: [[TMP17:%.*]] = add i129 [[TMP6]], -150
+; CHECK-NEXT: [[TMP18:%.*]] = shl i129 [[TMP8]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul i129 [[TMP18]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-cleanup1:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP12]], [[FP_TO_I_IF_THEN53]] ], [ [[TMP16]], [[FP_TO_I_IF_THEN125]] ], [ [[TMP19]], [[FP_TO_I_IF_ELSE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i129> poison, i129 [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
+; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], -1
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP24]], 23
+; CHECK-NEXT: [[TMP28:%.*]] = and i129 [[TMP27]], 255
+; CHECK-NEXT: [[TMP29:%.*]] = and i129 [[TMP24]], 8388607
+; CHECK-NEXT: [[TMP30:%.*]] = or i129 [[TMP29]], 8388608
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ult i129 [[TMP28]], 127
+; CHECK-NEXT: br i1 [[TMP31]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_END:%.*]]
+; CHECK: fp-to-i-if-end:
+; CHECK-NEXT: [[TMP32:%.*]] = add i129 [[TMP28]], -256
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ult i129 [[TMP32]], -129
+; CHECK-NEXT: br i1 [[TMP33]], label [[FP_TO_I_IF_THEN5:%.*]], label [[FP_TO_I_IF_END9:%.*]]
+; CHECK: fp-to-i-if-then5:
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP25]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-end9:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp ult i129 [[TMP28]], 150
+; CHECK-NEXT: br i1 [[TMP35]], label [[FP_TO_I_IF_THEN12:%.*]], label [[FP_TO_I_IF_ELSE:%.*]]
+; CHECK: fp-to-i-if-then12:
+; CHECK-NEXT: [[TMP36:%.*]] = sub i129 150, [[TMP28]]
+; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP30]], [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = mul i129 [[TMP37]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-else:
+; CHECK-NEXT: [[TMP39:%.*]] = add i129 [[TMP28]], -150
+; CHECK-NEXT: [[TMP40:%.*]] = shl i129 [[TMP30]], [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = mul i129 [[TMP40]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-cleanup:
+; CHECK-NEXT: [[TMP42:%.*]] = phi i129 [ [[TMP34]], [[FP_TO_I_IF_THEN5]] ], [ [[TMP38]], [[FP_TO_I_IF_THEN12]] ], [ [[TMP41]], [[FP_TO_I_IF_ELSE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i129> [[TMP21]], i129 [[TMP42]], i64 1
+; CHECK-NEXT: ret <2 x i129> [[TMP43]]
+;
+ %conv = fptosi <2 x float> %a to <2 x i129>
+ ret <2 x i129> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
index ed630d7..c699f80 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
@@ -176,3 +176,80 @@ define i129 @fp128toui129(fp128 %a) {
%conv = fptoui fp128 %a to i129
ret i129 %conv
}
+
+define <2 x i129> @floattoui129v2(<2 x float> %a) {
+; CHECK-LABEL: @floattoui129v2(
+; CHECK-NEXT: fp-to-i-entryfp-to-i-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i129 [[TMP2]], 23
+; CHECK-NEXT: [[TMP6:%.*]] = and i129 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i129 [[TMP2]], 8388607
+; CHECK-NEXT: [[TMP8:%.*]] = or i129 [[TMP7]], 8388608
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i129 [[TMP6]], 127
+; CHECK-NEXT: br i1 [[TMP9]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_END2:%.*]]
+; CHECK: fp-to-i-if-end2:
+; CHECK-NEXT: [[TMP10:%.*]] = add i129 [[TMP6]], -256
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i129 [[TMP10]], -129
+; CHECK-NEXT: br i1 [[TMP11]], label [[FP_TO_I_IF_THEN53:%.*]], label [[FP_TO_I_IF_END94:%.*]]
+; CHECK: fp-to-i-if-then53:
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP3]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-end94:
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i129 [[TMP6]], 150
+; CHECK-NEXT: br i1 [[TMP13]], label [[FP_TO_I_IF_THEN125:%.*]], label [[FP_TO_I_IF_ELSE6:%.*]]
+; CHECK: fp-to-i-if-then125:
+; CHECK-NEXT: [[TMP14:%.*]] = sub i129 150, [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i129 [[TMP15]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-else6:
+; CHECK-NEXT: [[TMP17:%.*]] = add i129 [[TMP6]], -150
+; CHECK-NEXT: [[TMP18:%.*]] = shl i129 [[TMP8]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul i129 [[TMP18]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-cleanup1:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP12]], [[FP_TO_I_IF_THEN53]] ], [ [[TMP16]], [[FP_TO_I_IF_THEN125]] ], [ [[TMP19]], [[FP_TO_I_IF_ELSE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i129> poison, i129 [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
+; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], -1
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP24]], 23
+; CHECK-NEXT: [[TMP28:%.*]] = and i129 [[TMP27]], 255
+; CHECK-NEXT: [[TMP29:%.*]] = and i129 [[TMP24]], 8388607
+; CHECK-NEXT: [[TMP30:%.*]] = or i129 [[TMP29]], 8388608
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ult i129 [[TMP28]], 127
+; CHECK-NEXT: br i1 [[TMP31]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_END:%.*]]
+; CHECK: fp-to-i-if-end:
+; CHECK-NEXT: [[TMP32:%.*]] = add i129 [[TMP28]], -256
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ult i129 [[TMP32]], -129
+; CHECK-NEXT: br i1 [[TMP33]], label [[FP_TO_I_IF_THEN5:%.*]], label [[FP_TO_I_IF_END9:%.*]]
+; CHECK: fp-to-i-if-then5:
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP25]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-end9:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp ult i129 [[TMP28]], 150
+; CHECK-NEXT: br i1 [[TMP35]], label [[FP_TO_I_IF_THEN12:%.*]], label [[FP_TO_I_IF_ELSE:%.*]]
+; CHECK: fp-to-i-if-then12:
+; CHECK-NEXT: [[TMP36:%.*]] = sub i129 150, [[TMP28]]
+; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP30]], [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = mul i129 [[TMP37]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-else:
+; CHECK-NEXT: [[TMP39:%.*]] = add i129 [[TMP28]], -150
+; CHECK-NEXT: [[TMP40:%.*]] = shl i129 [[TMP30]], [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = mul i129 [[TMP40]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-cleanup:
+; CHECK-NEXT: [[TMP42:%.*]] = phi i129 [ [[TMP34]], [[FP_TO_I_IF_THEN5]] ], [ [[TMP38]], [[FP_TO_I_IF_THEN12]] ], [ [[TMP41]], [[FP_TO_I_IF_ELSE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i129> [[TMP21]], i129 [[TMP42]], i64 1
+; CHECK-NEXT: ret <2 x i129> [[TMP43]]
+;
+ %conv = fptoui <2 x float> %a to <2 x i129>
+ ret <2 x i129> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
index 76f5248..f70ce2f 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
@@ -426,3 +426,166 @@ define fp128 @si129tofp128(i129 %a) {
%conv = sitofp i129 %a to fp128
ret fp128 %conv
}
+
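+; As in the scalar tests above, the vector int-to-fp case is expanded one
+; lane at a time: take the absolute value, use ctlz to find the exponent,
+; round the mantissa to 24 bits with guard/sticky logic, then pack sign,
+; exponent and mantissa into an i32 and bitcast it to float.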
+define <2 x float> @si129tofloatv2(<2 x i129> %a) {
+; CHECK-LABEL: @si129tofloatv2(
+; CHECK-NEXT: itofp-entryitofp-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i129 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
+; CHECK: itofp-if-end2:
+; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
+; CHECK-NEXT: [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP4]], i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = sub i32 129, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i32 128, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT: br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
+; CHECK: itofp-if-then43:
+; CHECK-NEXT: switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB4:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb4:
+; CHECK-NEXT: [[TMP10:%.*]] = shl i129 [[TMP4]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-default5:
+; CHECK-NEXT: [[TMP11:%.*]] = sub i32 103, [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 [[TMP4]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 26
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
+; CHECK-NEXT: [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-epilog6:
+; CHECK-NEXT: [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP4]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
+; CHECK-NEXT: [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = add i129 [[TMP26]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = ashr i129 [[TMP27]], 2
+; CHECK-NEXT: [[A310:%.*]] = and i129 [[TMP27]], 67108864
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
+; CHECK-NEXT: [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
+; CHECK-NEXT: br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
+; CHECK: itofp-if-then207:
+; CHECK-NEXT: [[TMP33:%.*]] = ashr i129 [[TMP27]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-else8:
+; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -105
+; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
+; CHECK-NEXT: [[TMP39:%.*]] = shl i129 [[TMP4]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
+; CHECK-NEXT: [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
+; CHECK-NEXT: [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-end269:
+; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
+; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
+; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 23
+; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
+; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN1]]
+; CHECK: itofp-return1:
+; CHECK-NEXT: [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
+; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i129 [[TMP55]], 0
+; CHECK-NEXT: br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
+; CHECK: itofp-if-end:
+; CHECK-NEXT: [[TMP57:%.*]] = ashr i129 [[TMP55]], 128
+; CHECK-NEXT: [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP55]]
+; CHECK-NEXT: [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
+; CHECK-NEXT: [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP59]], i1 true)
+; CHECK-NEXT: [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
+; CHECK-NEXT: [[TMP62:%.*]] = sub i32 129, [[TMP61]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub i32 128, [[TMP61]]
+; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
+; CHECK-NEXT: br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
+; CHECK: itofp-if-then4:
+; CHECK-NEXT: switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb:
+; CHECK-NEXT: [[TMP65:%.*]] = shl i129 [[TMP59]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-default:
+; CHECK-NEXT: [[TMP66:%.*]] = sub i32 103, [[TMP61]]
+; CHECK-NEXT: [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
+; CHECK-NEXT: [[TMP68:%.*]] = lshr i129 [[TMP59]], [[TMP67]]
+; CHECK-NEXT: [[TMP69:%.*]] = add i32 [[TMP61]], 26
+; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
+; CHECK-NEXT: [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP59]]
+; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
+; CHECK-NEXT: [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
+; CHECK-NEXT: [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-epilog:
+; CHECK-NEXT: [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP59]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
+; CHECK-NEXT: [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
+; CHECK-NEXT: [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
+; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[TMP78]], 1
+; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
+; CHECK-NEXT: [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
+; CHECK-NEXT: [[TMP82:%.*]] = add i129 [[TMP81]], 1
+; CHECK-NEXT: [[TMP83:%.*]] = ashr i129 [[TMP82]], 2
+; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP82]], 67108864
+; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
+; CHECK-NEXT: [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
+; CHECK-NEXT: [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
+; CHECK-NEXT: [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
+; CHECK-NEXT: br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
+; CHECK: itofp-if-then20:
+; CHECK-NEXT: [[TMP88:%.*]] = ashr i129 [[TMP82]], 3
+; CHECK-NEXT: [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
+; CHECK-NEXT: [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
+; CHECK-NEXT: [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-else:
+; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP61]], -105
+; CHECK-NEXT: [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
+; CHECK-NEXT: [[TMP94:%.*]] = shl i129 [[TMP59]], [[TMP93]]
+; CHECK-NEXT: [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
+; CHECK-NEXT: [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
+; CHECK-NEXT: [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-end26:
+; CHECK-NEXT: [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
+; CHECK-NEXT: [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
+; CHECK-NEXT: [[TMP102:%.*]] = shl i32 [[TMP99]], 23
+; CHECK-NEXT: [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
+; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
+; CHECK-NEXT: [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
+; CHECK-NEXT: [[TMP106:%.*]] = or i32 [[TMP105]], [[TMP103]]
+; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN]]
+; CHECK: itofp-return:
+; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT: ret <2 x float> [[TMP109]]
+;
+ %conv = sitofp <2 x i129> %a to <2 x float>
+ ret <2 x float> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
index 96d87a5..ee54d53 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
@@ -426,3 +426,166 @@ define fp128 @ui129tofp128(i129 %a) {
%conv = uitofp i129 %a to fp128
ret fp128 %conv
}
+
+define <2 x float> @ui129tofloatv2(<2 x i129> %a) {
+; CHECK-LABEL: @ui129tofloatv2(
+; CHECK-NEXT: itofp-entryitofp-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i129 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
+; CHECK: itofp-if-end2:
+; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
+; CHECK-NEXT: [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = sub i32 129, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i32 128, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT: br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
+; CHECK: itofp-if-then43:
+; CHECK-NEXT: switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB4:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb4:
+; CHECK-NEXT: [[TMP10:%.*]] = shl i129 [[TMP0]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-default5:
+; CHECK-NEXT: [[TMP11:%.*]] = sub i32 103, [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 [[TMP0]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 26
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP0]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
+; CHECK-NEXT: [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-epilog6:
+; CHECK-NEXT: [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP0]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
+; CHECK-NEXT: [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = add i129 [[TMP26]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP27]], 2
+; CHECK-NEXT: [[A310:%.*]] = and i129 [[TMP27]], 67108864
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
+; CHECK-NEXT: [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
+; CHECK-NEXT: br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
+; CHECK: itofp-if-then207:
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i129 [[TMP27]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-else8:
+; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -105
+; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
+; CHECK-NEXT: [[TMP39:%.*]] = shl i129 [[TMP0]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
+; CHECK-NEXT: [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
+; CHECK-NEXT: [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-end269:
+; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
+; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
+; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 23
+; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
+; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN1]]
+; CHECK: itofp-return1:
+; CHECK-NEXT: [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
+; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i129 [[TMP55]], 0
+; CHECK-NEXT: br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
+; CHECK: itofp-if-end:
+; CHECK-NEXT: [[TMP57:%.*]] = ashr i129 [[TMP55]], 128
+; CHECK-NEXT: [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP55]]
+; CHECK-NEXT: [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
+; CHECK-NEXT: [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP55]], i1 true)
+; CHECK-NEXT: [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
+; CHECK-NEXT: [[TMP62:%.*]] = sub i32 129, [[TMP61]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub i32 128, [[TMP61]]
+; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
+; CHECK-NEXT: br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
+; CHECK: itofp-if-then4:
+; CHECK-NEXT: switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb:
+; CHECK-NEXT: [[TMP65:%.*]] = shl i129 [[TMP55]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-default:
+; CHECK-NEXT: [[TMP66:%.*]] = sub i32 103, [[TMP61]]
+; CHECK-NEXT: [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
+; CHECK-NEXT: [[TMP68:%.*]] = lshr i129 [[TMP55]], [[TMP67]]
+; CHECK-NEXT: [[TMP69:%.*]] = add i32 [[TMP61]], 26
+; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
+; CHECK-NEXT: [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP55]]
+; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
+; CHECK-NEXT: [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
+; CHECK-NEXT: [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-epilog:
+; CHECK-NEXT: [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP55]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
+; CHECK-NEXT: [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
+; CHECK-NEXT: [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
+; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[TMP78]], 1
+; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
+; CHECK-NEXT: [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
+; CHECK-NEXT: [[TMP82:%.*]] = add i129 [[TMP81]], 1
+; CHECK-NEXT: [[TMP83:%.*]] = lshr i129 [[TMP82]], 2
+; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP82]], 67108864
+; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
+; CHECK-NEXT: [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
+; CHECK-NEXT: [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
+; CHECK-NEXT: [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
+; CHECK-NEXT: br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
+; CHECK: itofp-if-then20:
+; CHECK-NEXT: [[TMP88:%.*]] = lshr i129 [[TMP82]], 3
+; CHECK-NEXT: [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
+; CHECK-NEXT: [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
+; CHECK-NEXT: [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-else:
+; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP61]], -105
+; CHECK-NEXT: [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
+; CHECK-NEXT: [[TMP94:%.*]] = shl i129 [[TMP55]], [[TMP93]]
+; CHECK-NEXT: [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
+; CHECK-NEXT: [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
+; CHECK-NEXT: [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-end26:
+; CHECK-NEXT: [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
+; CHECK-NEXT: [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
+; CHECK-NEXT: [[TMP102:%.*]] = shl i32 [[TMP99]], 23
+; CHECK-NEXT: [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
+; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
+; CHECK-NEXT: [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
+; CHECK-NEXT: [[TMP106:%.*]] = or i32 [[TMP104]], [[TMP103]]
+; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN]]
+; CHECK: itofp-return:
+; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT: ret <2 x float> [[TMP109]]
+;
+ %conv = uitofp <2 x i129> %a to <2 x float>
+ ret <2 x float> %conv
+}
diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll
new file mode 100644
index 0000000..c02d84d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/implies.ll
@@ -0,0 +1,424 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
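+; Each test branches on a compare (%cond) whose truth or falsity implies a
+; second compare in one of the successors. Where the implication holds,
+; instcombine folds the implied compare to a constant; the *_fail and *_todo
+; tests cover cases where it does not hold or is not yet handled.
+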
+define i1 @or_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], 23
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %x, 23
+ %cond = icmp sle i8 %or, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle_fail(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], -34
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %x, -34
+ %cond = icmp sle i8 %or, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_distjoint_implies_ule(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_distjoint_implies_ule(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
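+; "or disjoint" guarantees %x shares no set bits with the constant, so the
+; or behaves like a carry-free, non-wrapping add: x+23 u< x+24, hence
+; (x2 u<= y) implies (x1 u<= y) in the test above.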
+
+define i1 @or_distjoint_implies_ule_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_distjoint_implies_ule_fail(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 28
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 28
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_prove_distjoin_implies_ule(i8 %xx, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_prove_distjoin_implies_ule(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], -16
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X]], 10
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x = and i8 %xx, -16
+ %x1 = or i8 %x, 7
+ %x2 = or i8 %x, 10
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_distjoint_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_distjoint_implies_sle(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_distjoint_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_distjoint_implies_sle_fail(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp slt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp sle i8 %y, %x2
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle(
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = add nsw i8 %x, 23
+ %x2 = add nsw i8 %x, 24
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle_fail(
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 23
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = add nsw i8 [[X]], 24
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = add nsw i8 %x, 24
+ %x2 = add nsw i8 %x, 23
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_ult(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult(
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ult i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %z, %x
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_ult_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult_fail(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Z]]
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ule i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %x, %z
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_slt_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_slt_fail(
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp slt i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %x, %y
+ %r = icmp slt i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_ule(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %y, %x
+ %cond = icmp uge i8 %z, %or
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_implies_false_ugt_todo(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_false_ugt_todo(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+; CHECK: F:
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %or = or i8 %x, %y
+ %cond = icmp ugt i8 %or, %z
+ br i1 %cond, label %T, label %F
+T:
+ ret i1 %other
+F:
+ %r = icmp ugt i8 %x, %z
+ ret i1 %r
+
+}
+
+define i1 @src_udiv_implies_ult(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult(
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ugt i8 %z, %x
+ br i1 %cond, label %T, label %F
+T:
+ %and = udiv i8 %x, 3
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_udiv_implies_ult2(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult2(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 true
+;
+ %cond = icmp ule i8 %z, %x
+ br i1 %cond, label %T, label %F
+T:
+ ret i1 %other
+F:
+ %and = udiv i8 %x, 3
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+}
+
+define i1 @src_smin_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smin_implies_sle(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp sle i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %um = call i8 @llvm.smin.i8(i8 %x, i8 %y)
+ %r = icmp sle i8 %um, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_umin_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umin_implies_ule(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ule i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %um = call i8 @llvm.umin.i8(i8 %x, i8 %y)
+ %r = icmp ule i8 %um, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_umax_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umax_implies_ule(
+; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %um = call i8 @llvm.umax.i8(i8 %x, i8 %y)
+ %cond = icmp ule i8 %um, %z
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_smax_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smax_implies_sle(
+; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %um = call i8 @llvm.smax.i8(i8 %x, i8 %y)
+ %cond = icmp sle i8 %um, %z
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index 5305c78..769f766 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -124,7 +124,6 @@ exit:
ret i8 %or2
}
-
define i8 @test_cond_and_bothways(i8 %x) {
; CHECK-LABEL: @test_cond_and_bothways(
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 91
@@ -181,8 +180,6 @@ exit:
ret i8 %or2
}
-
-
define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) {
; CHECK-LABEL: @test_cond_and_commuted(
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 3
@@ -343,7 +340,7 @@ exit:
ret i8 %or2
}
-define i32 @test_icmp_trunc1(i32 %x){
+define i32 @test_icmp_trunc1(i32 %x) {
; CHECK-LABEL: @test_icmp_trunc1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -365,7 +362,7 @@ else:
ret i32 0
}
-define i32 @test_icmp_trunc_assume(i32 %x){
+define i32 @test_icmp_trunc_assume(i32 %x) {
; CHECK-LABEL: @test_icmp_trunc_assume(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -532,7 +529,106 @@ if.else:
ret i1 %other
}
+define i8 @and_eq_bits_must_be_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 1
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 1
+ ret i8 %r
+}
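+; In the test above, x & y == 123 forces every bit set in 123 to be set in
+; both x and y; bit 0 of 123 (0b1111011) is set, so x & 1 is known to be 1.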
+
+define i8 @and_eq_bits_must_be_set2(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 11
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 11
+ ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2_partial_fail(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 111
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 111
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 0
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 3
+ ret i8 %r
+}
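+; Dually, x | y == 124 forces every bit clear in 124 (0b1111100) to be clear
+; in both x and y; bits 0 and 1 are clear, so x & 3 is known to be 0.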
+
+define i8 @or_eq_bits_must_be_unset2(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 0
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 1
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2_partial_fail(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 4
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 7
+ ret i8 %r
+}
+
+define i8 @or_ne_bits_must_be_unset2_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_ne_bits_must_be_unset2_fail(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 3
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp ne i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 3
+ ret i8 %r
+}
declare void @use(i1)
declare void @sink(i8)
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 278cabd..05fcf66 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3693,6 +3693,800 @@ exit:
ret i32 %rem
}
+; Select icmp and/or/xor
+; https://alive2.llvm.org/ce/z/QXQDwF
+; X&Y==C?X|Y:X^Y, X&Y==C?X^Y:X|Y
+; TODO: X&Y==0 could imply no_common_bit to TrueValue
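+; The folds below rest on bit identities: with X&Y == C, X|Y == (X^Y)|C
+; (and, since X^Y and X&Y share no bits, also (X^Y)^C), while
+; X^Y == (X|Y)^C == (X|Y)&~C; when the false arm is spelled in terms of C,
+; the select collapses to it.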
+define i32 @src_and_eq_0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_0_or_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, 0
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %cond = select i1 %cmp, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+; TODO: X&Y==0 could imply no_common_bit to TrueValue
+define i32 @src_and_eq_0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_0_xor_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, 0
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %cond = select i1 %cmp, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+; TODO: X&Y==-1 could imply all_common_bit to TrueValue
+define i32 @src_and_eq_neg1_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_neg1_or_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, -1
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %cond = select i1 %cmp, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+; TODO: X&Y==-1 could imply all_common_bit to TrueValue
+define i32 @src_and_eq_neg1_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_neg1_xor_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, -1
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %cond = select i1 %cmp, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_or_xororC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_or_xororC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %or1 = or i32 %xor, %c
+ %cond = select i1 %cmp, i32 %or, i32 %or1
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_or_xorxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_or_xorxorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %xor1 = xor i32 %xor, %c
+ %cond = select i1 %cmp, i32 %or, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_xor_OrAndNotC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_xor_OrAndNotC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[C:%.*]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[OR]], [[NOT]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %not = xor i32 %c, -1
+ %and1 = and i32 %or, %not
+ %cond = select i1 %cmp, i32 %xor, i32 %and1
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_xor_orxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_xor_orxorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[OR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %xor1 = xor i32 %or, %c
+ %cond = select i1 %cmp, i32 %xor, i32 %xor1
+ ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/9RPwfN
+; X|Y==C?X&Y:X^Y, X|Y==C?X^Y:X&Y
+; TODO: X|Y==0 could imply no_common_bit to TrueValue
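+; With X|Y == C the identities become X&Y == (X^Y)^C == ~(X^Y)&C and
+; X^Y == (X&Y)^C == ~(X&Y)&C, and the same collapse applies.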
+define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_0_and_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y]], [[X]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, 0
+ %and = and i32 %y, %x
+ %xor = xor i32 %y, %x
+ %cond = select i1 %cmp, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+; TODO: X|Y==0 could imply no_common_bit to TrueValue
+define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_0_xor_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, 0
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %cond = select i1 %cmp, i32 %xor, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_neg1_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_neg1_and_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[TMP0]], -1
+; CHECK-NEXT: ret i32 [[NOT]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, -1
+ %and = and i32 %y, %x
+ %0 = xor i32 %x, %y
+ %not = xor i32 %0, -1
+ %cond = select i1 %cmp, i32 %and, i32 %not
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_neg1_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_neg1_xor_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[AND]], -1
+; CHECK-NEXT: ret i32 [[NOT]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, -1
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %not = xor i32 %and, -1
+ %cond = select i1 %cmp, i32 %xor, i32 %not
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_and_xorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_and_xorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %and = and i32 %y, %x
+ %xor = xor i32 %y, %x
+ %xor1 = xor i32 %xor, %c
+ %cond = select i1 %cmp, i32 %and, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_and_andnotxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_and_andnotxorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[TMP0]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %and = and i32 %y, %x
+ %0 = xor i32 %x, %y
+ %not = xor i32 %0, -1
+ %and1 = and i32 %not, %c
+ %cond = select i1 %cmp, i32 %and, i32 %and1
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_xor_xorandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_xor_xorandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %xor1 = xor i32 %and, %c
+ %cond = select i1 %cmp, i32 %xor, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_xor_andnotandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_xor_andnotandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[AND]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %not = xor i32 %and, -1
+ %and1 = and i32 %not, %c
+ %cond = select i1 %cmp, i32 %xor, i32 %and1
+ ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/c6oXi4
+; X^Y==C?X&Y:X|Y, X^Y==C?X|Y:X&Y
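+; With X^Y == C: X&Y == (X|Y)^C == (X|Y)&~C and X|Y == (X&Y)^C == (X&Y)|C,
+; covering both orientations of the select below.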
+define i32 @src_xor_eq_neg1_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_xor_eq_neg1_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[OR]], -1
+; CHECK-NEXT: ret i32 [[NOT]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, -1
+ %and = and i32 %y, %x
+ %or = or i32 %y, %x
+ %not = xor i32 %or, -1
+ %cond = select i1 %cmp, i32 %and, i32 %not
+ ret i32 %cond
+}
+
+; TODO: X^Y==-1 could imply no_common_bit to TrueValue
+define i32 @src_xor_eq_neg1_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_xor_eq_neg1_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[XOR]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 -1
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, -1
+ %or = or i32 %y, %x
+ %cond = select i1 %cmp, i32 %or, i32 -1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_and_xororC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_and_xororC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[OR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %and = and i32 %y, %x
+ %or = or i32 %y, %x
+ %xor1 = xor i32 %or, %c
+ %cond = select i1 %cmp, i32 %and, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_and_andornotC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_and_andornotC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[C:%.*]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[OR]], [[NOT]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %and = and i32 %y, %x
+ %or = or i32 %y, %x
+ %not = xor i32 %c, -1
+ %and1 = and i32 %or, %not
+ %cond = select i1 %cmp, i32 %and, i32 %and1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_or_xorandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_or_xorandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %or = or i32 %y, %x
+ %and = and i32 %y, %x
+ %xor1 = xor i32 %and, %c
+ %cond = select i1 %cmp, i32 %or, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_or_orandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_or_orandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %or = or i32 %y, %x
+ %and = and i32 %y, %x
+ %or1 = or i32 %and, %c
+ %cond = select i1 %cmp, i32 %or, i32 %or1
+ ret i32 %cond
+}
+
+; Select icmp and/or/xor
+; NOT TRANSFORMED - the select condition is a compare with a constant other than 0
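+; e.g. X=1,Y=1 and X=1,Y=3 both give X&Y == 1, yet X|Y is 1 vs. 3 and X^Y is
+; 0 vs. 2, so knowing X&Y == 1 determines neither arm of the select.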
+define i32 @src_select_and_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_min_positive_int(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 1
+ %xor = xor i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_and_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_max_positive_int(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 2147483647
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 2147483647
+ %xor = xor i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_and_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_min_negative_int(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], -2147483648
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, -2147483648
+ %xor = xor i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_min_positive_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 1
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_max_positive_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 2147483647
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 2147483647
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_min_negative_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], -2147483648
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, -2147483648
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_max_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_max_negative_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, -1
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_min_positive_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 1
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_max_positive_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 2147483647
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 2147483647
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_min_negative_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], -2147483648
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, -2147483648
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_max_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_max_negative_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, -1
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+; Select icmp and/or/xor
+; https://alive2.llvm.org/ce/z/BVgrJ-
+; NOT TRANSFORMED - not supported
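+; Some of these are provably foldable (see the Alive2 link above), but
+; InstCombine does not implement them; at most the compared value is replaced
+; by the constant inside the matching select arm.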
+define i32 @src_no_trans_select_and_eq0_and_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_and_or(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 0, i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_and_xor(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 0, i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %and0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_or_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_or_and(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_xor_and(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[XOR]], i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %and0, i32 %xor, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %or0, i32 %or, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_or_xor(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_and_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_and_or(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_xor_or(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_ne0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_ne0_xor_or(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0_NOT:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0_NOT]], i32 0, i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp ne i32 %or, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_xor_and(
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 0, i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %xor0, i32 %xor, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_xor_or(
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 0, i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_and_xor(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/SBe8ei
+define i32 @src_no_trans_select_xor_eq0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_or_xor(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
; (X == C) ? X : Y -> (X == C) ? C : Y
; Fixed #77553
define i32 @src_select_xxory_eq0_xorxy_y(i32 %x, i32 %y) {
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index 661c360..a4b74aa 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes='instcombine<no-verify-fixpoint>' -S | FileCheck %s
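+; ('no-verify-fixpoint' disables the check that InstCombine reaches a fixed
+; point in a single iteration; presumably these tests now take more than one.)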
define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) {
; CHECK-LABEL: @zext_or_icmp_icmp(
@@ -180,11 +180,11 @@ define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) {
; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[SUB17]], 32
; CHECK-NEXT: [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]]
-; CHECK-NEXT: [[CONV19:%.*]] = zext i1 [[CMP]] to i16
-; CHECK-NEXT: [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]]
-; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0
+; CHECK-NEXT: [[TRUNC44:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[TRUNC44]], [[I162]]
+; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = xor i1 [[CMP]], true
; CHECK-NEXT: call void @llvm.assume(i1 [[TOBOOL23_NOT]])
-; CHECK-NEXT: ret i8 [[I162]]
+; CHECK-NEXT: ret i8 [[INC]]
;
%b = icmp eq i32 %t0, 0
%b2 = icmp eq i16 %insert, 0
diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll
index b70dc90..7e3cb65 100644
--- a/llvm/test/Transforms/InstSimplify/implies.ll
+++ b/llvm/test/Transforms/InstSimplify/implies.ll
@@ -155,7 +155,13 @@ define i1 @test9(i32 %length.i, i32 %i) {
define i1 @test10(i32 %length.i, i32 %x.full) {
; CHECK-LABEL: @test10(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -65536
+; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 100
+; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 90
+; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000
%large = or i32 %x, 100
@@ -166,6 +172,19 @@ define i1 @test10(i32 %length.i, i32 %x.full) {
ret i1 %res
}
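+; The 'disjoint' flag asserts the or's operands share no set bits, so the or
+; behaves like an add; with it the implication below can be proven.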
+define i1 @test10_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test10_with_disjoint(
+; CHECK-NEXT: ret i1 true
+;
+ %x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000
+ %large = or disjoint i32 %x, 100
+ %small = or disjoint i32 %x, 90
+ %known = icmp ult i32 %large, %length.i
+ %to.prove = icmp ult i32 %small, %length.i
+ %res = icmp ule i1 %known, %to.prove
+ ret i1 %res
+}
+
define i1 @test11(i32 %length.i, i32 %x) {
; CHECK-LABEL: @test11(
; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X:%.*]], 100
@@ -216,7 +235,13 @@ define i1 @test13(i32 %length.i, i32 %x) {
define i1 @test14(i32 %length.i, i32 %x.full) {
; CHECK-LABEL: @test14(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -61681
+; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 8224
+; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 4112
+; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f
%large = or i32 %x, 8224 ;; == 0x2020
@@ -227,6 +252,19 @@ define i1 @test14(i32 %length.i, i32 %x.full) {
ret i1 %res
}
+define i1 @test14_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test14_with_disjoint(
+; CHECK-NEXT: ret i1 true
+;
+ %x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f
+ %large = or disjoint i32 %x, 8224 ;; == 0x2020
+ %small = or disjoint i32 %x, 4112 ;; == 0x1010
+ %known = icmp ult i32 %large, %length.i
+ %to.prove = icmp ult i32 %small, %length.i
+ %res = icmp ule i1 %known, %to.prove
+ ret i1 %res
+}
+
define i1 @test15(i32 %length.i, i32 %x) {
; CHECK-LABEL: @test15(
; CHECK-NEXT: [[LARGE:%.*]] = add nuw i32 [[X:%.*]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 3e895edc..afd49aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -43,9 +43,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -135,9 +134,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..2ce2a45
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
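+; Both RUN lines share one set of checks: EVL-style tail folding is not used
+; on this PowerPC target, and the loop below is expected to stay scalar.
+;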
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
new file mode 100644
index 0000000..5d1a471
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -0,0 +1,117 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
+
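+; The printed initial VPlan is expected to use a predicated replicate region
+; (pred.store) for the conditional store rather than an EVL-based recipe.
+;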
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%16>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = icmp ule ir<%iv>, vp<%2>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: vp<%6> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%14> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%15> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%16> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%16>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%10>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%a1>
+; CHECK-NEXT: WIDEN ir<%v> = load vp<%5>
+; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT: vp<%9> = vector-pointer ir<%a2>
+; CHECK-NEXT: WIDEN store vp<%9>, ir<%v>
+; CHECK-NEXT: EMIT vp<%10> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%10>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [0, %entry], [%iv.next, %loop]
+ %a1 = getelementptr i64, ptr %p, i64 %iv
+ %v = load i64, ptr %a1, align 32
+ %offset = add i64 %iv, 100
+ %a2 = getelementptr i64, ptr %p, i64 %offset
+ store i64 %v, ptr %a2, align 32
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv, 511
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 57e1dc9..b876e9d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
+; FIXME: inloop reductions are not supported yet with predicated vectorization.
+
define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-LABEL: @add_i16_i32(
; OUTLOOP-NEXT: entry:
@@ -115,6 +117,70 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
+; IF-EVL-LABEL: @add_i16_i32(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL: for.body.preheader:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
+; IF-EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup.loopexit:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-NEXT: ret i32 [[R_0_LCSSA]]
+;
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
new file mode 100644
index 0000000..835ff37
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
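+; With EVL tail folding, the indexed accesses below become @llvm.vp.gather and
+; @llvm.vp.scatter calls whose active length comes from
+; @llvm.experimental.get.vector.length (see the IF-EVL checks).
+;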
+define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
+; IF-EVL-LABEL: @gather_scatter(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP25]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP25]]
+; IF-EVL-NEXT: store float [[TMP26]], ptr [[ARRAYIDX7]], align 4
+; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @gather_scatter(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
+; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NO-VP: for.end:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
+ %0 = load i64, ptr %arrayidx3, align 8
+ %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %0
+ %1 = load float, ptr %arrayidx5, align 4
+ %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %0
+ store float %1, ptr %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
new file mode 100644
index 0000000..0b495bc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; FIXME: interleaved accesses are not supported yet with predicated vectorization.
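+; Until then, the IF-EVL output below lowers the interleave group with masked
+; gathers and masked stores instead.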
+define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
+; IF-EVL-LABEL: @interleave(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1
+; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @interleave(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
+; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
+; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; NO-VP-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; NO-VP-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], [[TMP29]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 0
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
new file mode 100644
index 0000000..d5ad99f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
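+; The induction variable here is i32, so the EVL variant computes the active
+; length with @llvm.experimental.get.vector.length.i32 (see the IF-EVL checks).
+;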
+define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
+; IF-EVL-LABEL: @iv32(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP19]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP12]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV1]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV1]]
+; IF-EVL-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @iv32(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP10]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP1]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP11]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul i32 [[TMP2]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; NO-VP-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; NO-VP-NEXT: store i32 [[TMP9]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i32 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 %iv
+ store i32 %0, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
new file mode 100644
index 0000000..203d0c9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
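+; The store to %a is guarded by a condition on the value loaded from %b, so the
+; vectorizer must combine the header mask with the block mask; check that the
+; combined mask is what the vp.load/vp.store of %a receive.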
+define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
+; IF-EVL-LABEL: @masked_loadstore(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL: if.then:
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; IF-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: br label [[FOR_INC]]
+; IF-EVL: for.inc:
+; IF-EVL-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: exit:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @masked_loadstore(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP0]], 0
+; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP: if.then:
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: br label [[FOR_INC]]
+; NO-VP: for.inc:
+; NO-VP-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; NO-VP: exit:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.011
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp1 = icmp ne i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %i.011
+ %1 = load i32, ptr %arrayidx3, align 4
+ %add = add i32 %0, %1
+ store i32 %add, ptr %arrayidx3, align 4
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i64 %i.011, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
new file mode 100644
index 0000000..1c49fba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; There is no need to emit predicated vector code if masked vector instructions are not required.
+define i32 @no_masking() {
+; CHECK-LABEL: @no_masking(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BODY:%.*]]
+; CHECK: body:
+; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BODY]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[P]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY]]
+; CHECK: end:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %body
+
+body:
+ %p = phi i32 [ 1, %entry ], [ %inc, %body ]
+ %inc = add i32 %p, 1
+ %cmp = icmp eq i32 %inc, 0
+ br i1 %cmp, label %end, label %body
+
+end:
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
new file mode 100644
index 0000000..f2222e0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+; FIXME: Reversed loads/stores are not yet supported with predicated vectorization.
+define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
+; IF-EVL-LABEL: @reverse_load_store(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 0, [[TMP14]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
+; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
+; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
+; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; IF-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
+; IF-EVL-NEXT: [[INC]] = add i32 [[I]], 1
+; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: loopend:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @reverse_load_store(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
+; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
+; NO-VP-NEXT: [[INC]] = add i32 [[I]], 1
+; NO-VP-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP: loopend:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %add = add i64 %add.phi, -1
+ %gepl = getelementptr inbounds i32, ptr %ptr, i64 %add
+ %tmp = load i32, ptr %gepl, align 4
+ %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add
+ store i32 %tmp, ptr %geps, align 4
+ %inc = add i32 %i, 1
+ %exitcond = icmp ne i32 %inc, 1024
+ br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
new file mode 100644
index 0000000..c69bb17
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
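+; A plain a[i] = b[i] + c[i] loop: with data-with-evl tail folding the memory
+; accesses become vp.load/vp.store with an all-true mask and the EVL computed
+; by llvm.experimental.get.vector.length, while the NO-VP run keeps unmasked
+; wide loads/stores plus a scalar epilogue.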
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
new file mode 100644
index 0000000..72b881b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -0,0 +1,140 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
+
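+; With EVL tail folding the initial VPlan carries an
+; EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI and an EXPLICIT-VECTOR-LENGTH recipe in
+; place of a header mask; the NO-VP plan below keeps the plain canonical
+; induction.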
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%N>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, ir<true>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, ir<true>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
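+; This loop reads %p[%iv] and writes %p[%iv + 100]; the dependence distance is
+; large enough for the candidate VFs, so no predication is needed.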
+define void @safe_dep(ptr %p) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]>
+; CHECK-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; CHECK-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]>
+; CHECK-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100>
+; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]>
+; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]>
+; CHECK-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [0, %entry], [%iv.next, %loop]
+ %a1 = getelementptr i64, ptr %p, i64 %iv
+ %v = load i64, ptr %a1, align 32
+ %offset = add i64 %iv, 100
+ %a2 = getelementptr i64, ptr %p, i64 %offset
+ store i64 %v, ptr %a2, align 32
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv, 511
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..1cf71360
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
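+; x86 has no EVL support, so the data-with-evl request falls back to tail
+; folding by masking: the IF-EVL output below uses llvm.masked.load and
+; llvm.masked.store with a header mask rather than VP intrinsics.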
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: iter.check:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; NO-VP: vector.main.loop.iter.check:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 64
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32
+; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 32
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 48
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 16
+; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 32
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 48
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP18]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP19]], align 4
+; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
+; NO-VP-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
+; NO-VP-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 16
+; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 32
+; NO-VP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 48
+; NO-VP-NEXT: store <16 x i32> [[TMP20]], ptr [[TMP28]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP21]], ptr [[TMP29]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP22]], ptr [[TMP30]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP23]], ptr [[TMP31]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; NO-VP-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; NO-VP: vec.epilog.iter.check:
+; NO-VP-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; NO-VP: vec.epilog.ph:
+; NO-VP-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; NO-VP-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8
+; NO-VP-NEXT: [[N_VEC10:%.*]] = sub i64 [[N]], [[N_MOD_VF9]]
+; NO-VP-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; NO-VP: vec.epilog.vector.body:
+; NO-VP-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP33:%.*]] = add i64 [[INDEX12]], 0
+; NO-VP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP35]], align 4
+; NO-VP-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4
+; NO-VP-NEXT: [[TMP38:%.*]] = add nsw <8 x i32> [[WIDE_LOAD14]], [[WIDE_LOAD13]]
+; NO-VP-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 0
+; NO-VP-NEXT: store <8 x i32> [[TMP38]], ptr [[TMP40]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; NO-VP-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
+; NO-VP-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: vec.epilog.middle.block:
+; NO-VP-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; NO-VP-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]]
+; NO-VP: vec.epilog.scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP43]], [[TMP42]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
new file mode 100644
index 0000000..9b49d44
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
@@ -0,0 +1,92 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
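+; Likewise at the VPlan level on x86: without EVL support the plan uses a
+; WIDEN-CANONICAL-INDUCTION header mask (icmp ule against the backedge-taken
+; count) instead of EXPLICIT-VECTOR-LENGTH recipes.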
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]>
+; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
new file mode 100644
index 0000000..0b87270
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
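+; Any-of reduction with epilogue vectorization: the reduction result of the
+; main vector loop must be merged into the epilogue loop's reduction phi (via
+; the splat of the resume value) and into the final scalar phis.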
+define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
+; CHECK-LABEL: define i32 @any_of_reduction_epilog(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI6]]
+; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <4 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP16]], i32 1, i32 0
+; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
+; CHECK-NEXT: [[SELECT]] = select i1 [[ICMP]], i32 1, i32 [[RED]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[ICMP3:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[ICMP3]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[SELECT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i32 [ 0, %entry ], [ %select, %loop ]
+ %gep = getelementptr inbounds i8, ptr %src, i64 %iv
+ %load = load i8, ptr %gep, align 1
+ %icmp = icmp eq i8 %load, 0
+ %select = select i1 %icmp, i32 1, i32 %red
+ %iv.next = add i64 %iv, 1
+ %icmp3 = icmp eq i64 %iv, %N
+ br i1 %icmp3, label %exit, label %loop
+
+exit:
+ ret i32 %select
+}
+
+
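+; Same pattern with an i1 reduction: the start and resume values are i1, so
+; the identity splat and the selects in both vector loops operate on <4 x i1>.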
+define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
+; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog(
+; CHECK-SAME: i64 [[N:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i1> [[VEC_PHI]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i1> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i1 false, i1 false
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[IND_END6:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
+; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i1> [[VEC_PHI10]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne <4 x i1> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP16]])
+; CHECK-NEXT: [[RDX_SELECT16:%.*]] = select i1 [[TMP13]], i1 false, i1 false
+; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RED_I1:%.*]] = phi i1 [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i32 [[IV_2]], [[A]]
+; CHECK-NEXT: [[SEL]] = select i1 [[CMP_1]], i1 [[RED_I1]], i1 false
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; CHECK-NEXT: [[CMP_2:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i1 [[SEL_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red.i1 = phi i1 [ false, %entry ], [ %sel, %loop ]
+ %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %cmp.1 = icmp eq i32 %iv.2, %a
+ %sel = select i1 %cmp.1, i1 %red.i1, i1 false
+ %iv.next = add i64 %iv, 1
+ %iv.2.next = add i32 %iv.2, 1
+ %cmp.2 = icmp eq i64 %iv, %N
+ br i1 %cmp.2, label %exit, label %loop
+
+exit:
+ ret i1 %sel
+
+; uselistorder directives
+ uselistorder i1 %sel, { 1, 0 }
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..a90b38c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; The target does not support predicated vectorization, so the IF-EVL run is
+; expected to leave the loop in scalar form, while the NO-VP run vectorizes it
+; with a scalar epilogue.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP8]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP14]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP15:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
new file mode 100644
index 0000000..f510d47
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; The target does not support predicated vectorization, so neither RUN line
+; should produce an explicit-vector-length-based IV phi in the VPlan.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; NO-VP-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/PGOProfile/vtable_profile.ll b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
index a844003..aae1e2d 100644
--- a/llvm/test/Transforms/PGOProfile/vtable_profile.ll
+++ b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
@@ -1,9 +1,6 @@
; RUN: opt < %s -passes=pgo-instr-gen -enable-vtable-value-profiling -S 2>&1 | FileCheck %s --check-prefix=GEN --implicit-check-not="VTable value profiling is presently not supported"
; RUN: opt < %s -passes=pgo-instr-gen,instrprof -enable-vtable-value-profiling -S 2>&1 | FileCheck %s --check-prefix=LOWER --implicit-check-not="VTable value profiling is presently not supported"
-; __llvm_prf_vnm stores zlib-compressed vtable names.
-; REQUIRES: zlib
-
source_filename = "vtable_local.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -59,7 +56,7 @@ target triple = "x86_64-unknown-linux-gnu"
; LOWER: $"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = comdat nodeduplicate
; LOWER: @__profvt__ZTV7Derived = global { i64, ptr, i32 } { i64 -4576307468236080025, ptr @_ZTV7Derived, i32 48 }, section "__llvm_prf_vtab", comdat, align 8
; LOWER: @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = internal global { i64, ptr, i32 } { i64 1419990121885302679, ptr @_ZTVN12_GLOBAL__N_15Base2E, i32 24 }, section "__llvm_prf_vtab", comdat, align 8
-; LOWER: @__llvm_prf_vnm = private constant [64 x i8] c"7>x\DA\8B\8F\0A\093wI-\CA,KMa,+IL\CAI\8D\CF\C9ON\CC\D1\CB\C9\B1\8E\07J\FA\19\1A\C5\BB\FB\F8;9\FA\C4\C7\FB\C5\1B\9A:%\16\A7\1A\B9\02\00\19:\12o", section "__llvm_prf_vns", align 1
+; LOWER: @__llvm_prf_vnm = private constant {{.*}}, section "__llvm_prf_vns", align 1
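+; The vtable name blob may or may not be zlib-compressed, so only the section
+; is pinned here rather than the exact payload; this avoids a REQUIRES: zlib.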
; LOWER: @llvm.used = appending global [5 x ptr] [ptr @__profvt__ZTV7Derived, ptr @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E", ptr @__llvm_prf_vnodes, ptr @__llvm_prf_nm, ptr @__llvm_prf_vnm], section "llvm.metadata"
define i32 @_Z4funci(i32 %a) {
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
index 495ec0a..45e411d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
@@ -9,11 +9,7 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
-; CHECK-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
diff --git a/llvm/test/Transforms/RemoveTraps/remove-traps.ll b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
index 80b86e0..c8d5fec 100644
--- a/llvm/test/Transforms/RemoveTraps/remove-traps.ll
+++ b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
@@ -1,18 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes='function(remove-traps)' -S | FileCheck %s --check-prefixes=NOPROFILE
; RUN: opt < %s -passes='function(remove-traps)' -remove-traps-random-rate=1 -S | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -S | FileCheck %s --check-prefixes=HOT
+; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=HOT99
; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=HOT70
target triple = "x86_64-pc-linux-gnu"
declare void @llvm.ubsantrap(i8 immarg)
+declare i1 @llvm.allow.ubsan.check(i8 immarg)
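+; The functions below call @llvm.allow.ubsan.check(i8 22) and invert the
+; result to form the "hot" flag, replacing the former @llvm.experimental.hot.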
define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; NOPROFILE-LABEL: define dso_local noundef i32 @simple(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -24,7 +26,8 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; ALL-LABEL: define dso_local noundef i32 @simple(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -33,22 +36,24 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @simple(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
-; HOT-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @simple(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @simple(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -58,7 +63,8 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; HOT70-NEXT: ret i32 [[TMP5]]
;
%chk = icmp eq ptr %0, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
@@ -76,7 +82,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; NOPROFILE-LABEL: define dso_local noundef i32 @hot(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -88,7 +95,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; ALL-LABEL: define dso_local noundef i32 @hot(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -97,22 +105,24 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @hot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
-; HOT-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @hot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @hot(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -122,7 +132,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; HOT70-NEXT: ret i32 [[TMP5]]
;
%chk = icmp eq ptr %0, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
@@ -139,7 +150,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; NOPROFILE-LABEL: define dso_local noundef i32 @veryHot(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -151,7 +163,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; ALL-LABEL: define dso_local noundef i32 @veryHot(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -160,22 +173,24 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @veryHot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
-; HOT-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @veryHot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @veryHot(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -185,7 +200,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; HOT70-NEXT: ret i32 [[TMP5]]
;
%chk = icmp eq ptr %0, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
@@ -206,7 +222,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; NOPROFILE-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; NOPROFILE: 4:
; NOPROFILE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; NOPROFILE: 6:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -224,7 +241,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; ALL-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; ALL: 6:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -236,23 +254,24 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; ALL-NEXT: ret i32 [[TMP10]]
;
-; HOT-LABEL: define dso_local noundef i32 @branchColdFnHot(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
-; HOT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
-; HOT-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT: 6:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 7:
-; HOT-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT: br label [[TMP9]]
-; HOT: 9:
-; HOT-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT: ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99: 6:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 7:
+; HOT99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT: br label [[TMP9]]
+; HOT99: 9:
+; HOT99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT: ret i32 [[TMP10]]
;
; HOT70-LABEL: define dso_local noundef i32 @branchColdFnHot(
; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
@@ -260,7 +279,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; HOT70: 6:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -277,7 +297,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
4:
%chk = icmp eq ptr %1, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%5 = or i1 %chk, %hot
br i1 %5, label %6, label %7
@@ -301,7 +322,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; NOPROFILE-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; NOPROFILE: 4:
; NOPROFILE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; NOPROFILE: 6:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -319,7 +341,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; ALL-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; ALL: 6:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -331,23 +354,24 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; ALL-NEXT: ret i32 [[TMP10]]
;
-; HOT-LABEL: define dso_local noundef i32 @branchHotFnCold(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
-; HOT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], true
-; HOT-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT: 6:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 7:
-; HOT-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT: br label [[TMP9]]
-; HOT: 9:
-; HOT-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT: ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99: 6:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 7:
+; HOT99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT: br label [[TMP9]]
+; HOT99: 9:
+; HOT99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT: ret i32 [[TMP10]]
;
; HOT70-LABEL: define dso_local noundef i32 @branchHotFnCold(
; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
@@ -355,7 +379,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; HOT70: 6:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -372,7 +397,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
4:
%chk = icmp eq ptr %1, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%5 = or i1 %chk, %hot
br i1 %5, label %6, label %7
@@ -424,10 +450,10 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
; ALL: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
-; HOT: [[PROF16]] = !{!"function_entry_count", i64 1000}
-; HOT: [[PROF17]] = !{!"function_entry_count", i64 7000}
-; HOT: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
-; HOT: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+; HOT99: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; HOT99: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; HOT99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; HOT99: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
; HOT70: [[PROF16]] = !{!"function_entry_count", i64 1000}
; HOT70: [[PROF17]] = !{!"function_entry_count", i64 7000}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 89ea15d..e492596 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -142,17 +142,16 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-LABEL: define void @gather_2(
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
-; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
-; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
-; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
-; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
; CHECK-NEXT: ret void
;
@@ -358,12 +357,12 @@ define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
-; CHECK-NEXT: store float [[TMP2]], ptr [[ARRAYIDX5_I]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
-; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX8_I831]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll
new file mode 100644
index 0000000..979d0ea
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=systemz-unknown -mcpu=z15 < %s -slp-threshold=-10 | FileCheck %s
+
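+; The scalar code mixes zext and sext of i1 compare results. The vectorized
+; form must keep both extension kinds as separate nodes and blend them with a
+; shuffle instead of folding everything into a single extend.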
+define i32 @test(ptr %0, ptr %1) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 32 to ptr), align 32
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 32
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = sext <2 x i1> [[TMP9]] to <2 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP9]] to <2 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i8> [[TMP16]], <2 x i8> [[TMP11]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i8> [[TMP12]], i32 0
+; CHECK-NEXT: [[DOTNEG:%.*]] = sext i8 [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i8> [[TMP12]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[DOTNEG]], [[TMP8]]
+; CHECK-NEXT: ret i32 [[TMP10]]
+;
+ %3 = load i64, ptr inttoptr (i64 32 to ptr), align 32
+ %4 = load ptr, ptr %1, align 8
+ %5 = getelementptr inbounds i8, ptr %4, i64 32
+ %6 = load i64, ptr %5, align 8
+ %7 = icmp ne i64 %3, 0
+ %8 = zext i1 %7 to i32
+ %9 = icmp ne i64 %6, 0
+ %.neg = sext i1 %9 to i32
+ %10 = add nsw i32 %.neg, %8
+ ret i32 %10
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
new file mode 100644
index 0000000..84f7e21
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@e = global i8 0
+@c = global i16 0
+@d = global i32 0
+
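+; %or.30 feeds both the vectorized or-reduction and a scalar external user
+; (the store to @d), so minbitwidth analysis must keep the lane wide enough to
+; extract the full i32 value for that external use.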
+define i8 @test() {
+; CHECK-LABEL: define i8 @test() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @e, align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @c, align 2
+; CHECK-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32769>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT: [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; CHECK-NEXT: [[XOR_31:%.*]] = and i32 [[TMP13]], -2
+; CHECK-NEXT: store i32 [[XOR_31]], ptr @d, align 4
+; CHECK-NEXT: ret i8 [[CONV4_30]]
+;
+entry:
+ %0 = load i8, ptr @e, align 1
+ %conv = sext i8 %0 to i32
+ %1 = load i16, ptr @c, align 2
+ %conv1 = zext i16 %1 to i32
+ %or.16 = or i32 %conv, 1
+ %add.16 = add nsw i32 %or.16, %conv1
+ %or.18 = or i32 %conv, 1
+ %add.18 = add nsw i32 %or.18, %conv1
+ %conv4.181 = or i32 %add.16, %add.18
+ %or.20 = or i32 %conv, 1
+ %add.20 = add nsw i32 %or.20, %conv1
+ %conv4.202 = or i32 %conv4.181, %add.20
+ %or.22 = or i32 %conv, 1
+ %add.22 = add nsw i32 %or.22, %conv1
+ %conv4.223 = or i32 %conv4.202, %add.22
+ %or.24 = or i32 %conv, 1
+ %add.24 = add nsw i32 %or.24, %conv1
+ %conv4.244 = or i32 %conv4.223, %add.24
+ %or.26 = or i32 %conv, 1
+ %add.26 = add nsw i32 %or.26, %conv1
+ %conv4.265 = or i32 %conv4.244, %add.26
+ %or.28 = or i32 %conv, 1
+ %add.28 = add nsw i32 %or.28, %conv1
+ %conv4.286 = or i32 %conv4.265, %add.28
+ %or.30 = or i32 %conv, 32769
+ %add.30 = add nsw i32 %or.30, %conv1
+ %conv4.307 = or i32 %conv4.286, %add.30
+ %conv4.30 = trunc i32 %conv4.307 to i8
+ %xor.31 = and i32 %or.30, -2
+ store i32 %xor.31, ptr @d, align 4
+ ret i8 %conv4.30
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 66e3fbf..4cc3c12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
define void @PR49730() {
; CHECK-LABEL: @PR49730(
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
new file mode 100644
index 0000000..6b27015
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
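+; The compares mix sle/slt/sgt predicates against the same operand; after
+; operand reordering, the alternate-opcode node must select the predicate that
+; matches each lane's (possibly swapped) operand order.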
+define i32 @test(ptr %sptr, i64 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[SPTR:%.*]], i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT: [[IV2:%.*]] = getelementptr i8, ptr [[SPTR]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IV2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV_I]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 1, i32 5, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT: [[AND33:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT: ret i32 [[AND33]]
+;
+entry:
+ %conv.i = trunc i64 %0 to i32
+ %iv2 = getelementptr i8, ptr %sptr, i64 4
+ %1 = load i32, ptr %iv2, align 4
+ %cmp11 = icmp slt i32 %1, %conv.i
+ %cmp.i57 = icmp eq i32 %1, 0
+ %or.i5977 = or i1 %cmp.i57, %cmp11
+ %iv4 = getelementptr i8, ptr %sptr, i64 12
+ %2 = load i32, ptr %iv4, align 4
+ %cmp16 = icmp sle i32 %2, %conv.i
+ %cmp.i62 = icmp eq i32 %2, 0
+ %or.i6478 = or i1 %cmp.i62, %cmp16
+ %iv3 = getelementptr i8, ptr %sptr, i64 8
+ %3 = load i32, ptr %iv3, align 8
+ %cmp21 = icmp sgt i32 %3, %conv.i
+ %cmp.i67 = icmp eq i32 %3, 0
+ %or.i6979 = or i1 %cmp.i67, %cmp21
+ %iv5 = getelementptr i8, ptr %sptr, i64 16
+ %4 = load i32, ptr %iv5, align 8
+ %cmp26 = icmp slt i32 %conv.i, 0
+ %cmp.i72 = icmp eq i32 %4, 0
+ %or.i7480 = or i1 %cmp.i72, %cmp26
+ %and3183 = and i1 %or.i5977, %or.i6478
+ %and3284 = and i1 %and3183, %or.i6979
+ %and3385 = and i1 %and3284, %or.i7480
+ %and33 = zext i1 %and3385 to i32
+ ret i32 %and33
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index fc28d7a..e1fd8a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 136ab64..668d3c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -10,12 +10,14 @@ define void @test() {
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index b5a3c57..acc04be 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -94,17 +94,13 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
-; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false
-; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 1, i32 3, i32 6, i32 0>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; SSE-NEXT: [[S3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
; SSE-NEXT: ret i1 [[S3]]
;
; AVX-LABEL: @logical_and_icmp_diff_preds(
@@ -391,17 +387,28 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
}
define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 2, i32 3>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 poison, i32 poison, i32 poison, i32 42>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
+; AVX-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
+; AVX-NEXT: ret i1 [[TMP8]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index fb2b653..82085ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@ define void @test() {
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 46cca9b..1faeea7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -142,8 +142,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index 66229c2..8b131cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
define void @test2() {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
new file mode 100644
index 0000000..8e98851
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
@@ -0,0 +1,23 @@
+main:9229397:0
+ 0: 0
+ 1: 0
+ 1.1: 47663
+ 1.2: 51871
+ 2: 48723
+ 3: 48723 bar:49018
+ 4: 49087
+ 5: 51871 bar:49588
+ 7: 0
+ 2: foo:1479916
+ 1: 47663
+ 1.1: 46683 bar:43238
+ 2: 4519 bar:4932
+ 3: 48723
+ 4: foo:1505537
+ 1: 48604
+ 1.1: 46965 bar:44479
+ 2: 4613 bar:4967
+ 3: 49087
+bar:2333388:196222
+ 0: 194449
+ 1: 194449
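The input above uses LLVM's text sample-profile format: a "name:total_samples:entry_samples" header per function, body lines of the form "line_offset[.discriminator]: samples [callee:count ...]", and an indented "offset: callee:total_samples" line that opens a nested block for an inlined callsite. A minimal reader sketch for the flat body lines, assuming that layout (nested inline-callsite blocks are skipped for brevity):

    // Minimal sketch of a reader for the flat body lines of the text
    // sample-profile format shown above. Assumed layout (hedged):
    //   function header: "name:total_samples:entry_samples"
    //   body line:       "offset[.disc]: samples [callee:count ...]"
    // Nested inlined-callsite blocks ("offset: callee:total") are skipped.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    int main() {
      std::map<std::string, uint64_t> BodySamples; // "offset[.disc]" -> samples
      std::string Line;
      while (std::getline(std::cin, Line)) {
        std::istringstream SS(Line);
        std::string Loc;
        uint64_t Samples;
        // Body lines start with a "<loc>:" token followed by a number;
        // headers and inline-callsite lines fail one of these reads.
        if (SS >> Loc && !Loc.empty() && Loc.back() == ':' && SS >> Samples)
          BodySamples[Loc.substr(0, Loc.size() - 1)] += Samples;
      }
      for (const auto &KV : BodySamples)
        std::cout << KV.first << " -> " << KV.second << "\n";
    }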
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
index ba4c611..d384794 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
@@ -1,8 +1,8 @@
foo:3200:13
1: 13
2: 7
- 3: 6
- 4: 13
- 5: 7 _Z3barv:2 _Z3foov:5
- 6: 6 _Z3barv:4 _Z3foov:2
+ 4: 6
+ 6: 13
+ 3: 7 _Z3barv:2 _Z3foov:5
+ 5: 6 _Z3barv:4 _Z3foov:2
!CFGChecksum: 563022570642068
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
index 62f9bd5..213bf0b 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
@@ -1,8 +1,8 @@
foo:3200:13
1: 13
2: 7
- 3: 6
- 4: 13
- 5: 7
- 6: 6
+ 4: 6
+ 6: 13
+ 7: 7
+ 9: 6
!CFGChecksum: 844530426352218
diff --git a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
new file mode 100644
index 0000000..eb69c18a
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
@@ -0,0 +1,229 @@
+; REQUIRES: x86_64-linux
+; REQUIRES: asserts
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/non-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s
+
+; The profiled source code:
+
+; volatile int x = 1;
+; __attribute__((noinline)) int bar(int p) {
+; return p;
+; }
+
+; __attribute__((always_inline)) int foo(int i, int p) {
+; if (i % 10) return bar(p);
+; else return bar(p + 1);
+; }
+
+; int main() {
+; for (int i = 0; i < 1000 * 1000; i++) {
+; x += foo(i, x);
+; x += bar(x);
+; x += foo(i, x);
+; x += bar(x);
+; }
+; }
+
+; The source code for the current build:
+
+; volatile int x = 1;
+; __attribute__((noinline)) int bar(int p) {
+; return p;
+; }
+
+; __attribute__((always_inline)) int foo(int i, int p) {
+; if (i % 10) return bar(p);
+; else return bar(p + 1);
+; }
+
+; int main() {
+; if (x == 0) // code change
+; return 0; // code change
+; for (int i = 0; i < 1000 * 1000; i++) {
+; x += foo(i, x);
+; x += bar(x);
+; if (i < 0) // code change
+; return 0; // code change
+; x += foo(i, x);
+; x += bar(x);
+; }
+; }
+
+; CHECK: Run stale profile matching for bar
+
+; CHECK: Run stale profile matching for foo
+; CHECK: Callsite with callee:bar is matched from 1.1 to 1.1
+; CHECK: Callsite with callee:bar is matched from 2 to 2
+
+; CHECK: Run stale profile matching for main
+; CHECK: Callsite with callee:foo is matched from 4 to 2
+; CHECK: Callsite with callee:bar is matched from 5 to 3
+; CHECK: Callsite with callee:foo is matched from 8 to 4
+; CHECK: Callsite with callee:bar is matched from 9 to 5
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = dso_local global i32 1, align 4
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @bar(i32 noundef %p) #0 !dbg !9 {
+entry:
+ ret i32 %p, !dbg !13
+}
+
+; Function Attrs: alwaysinline nounwind uwtable
+define dso_local i32 @foo(i32 noundef %i, i32 noundef %p) #1 !dbg !14 {
+entry:
+ %rem = srem i32 %i, 10, !dbg !15
+ %tobool = icmp ne i32 %rem, 0, !dbg !15
+ br i1 %tobool, label %if.then, label %if.else, !dbg !16
+
+if.then: ; preds = %entry
+ %call = call i32 @bar(i32 noundef %p), !dbg !17
+ br label %return, !dbg !19
+
+if.else: ; preds = %entry
+ %add = add nsw i32 %p, 1, !dbg !20
+ %call1 = call i32 @bar(i32 noundef %add), !dbg !21
+ br label %return, !dbg !22
+
+return: ; preds = %if.else, %if.then
+ %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ], !dbg !23
+ ret i32 %retval.0, !dbg !24
+}
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @main() #2 !dbg !25 {
+entry:
+ %0 = load volatile i32, ptr @x, align 4, !dbg !26, !tbaa !27
+ %cmp = icmp eq i32 %0, 0, !dbg !31
+ br i1 %cmp, label %if.then, label %if.end, !dbg !26
+
+if.then: ; preds = %entry
+ br label %for.end, !dbg !32
+
+if.end: ; preds = %entry
+ br label %for.cond, !dbg !33
+
+for.cond: ; preds = %if.end6, %if.end
+ %i.0 = phi i32 [ 0, %if.end ], [ %inc, %if.end6 ], !dbg !34
+ %cmp1 = icmp slt i32 %i.0, 1000000, !dbg !35
+ br i1 %cmp1, label %for.body, label %for.cond.cleanup, !dbg !37
+
+for.cond.cleanup: ; preds = %for.cond
+ br label %cleanup, !dbg !38
+
+for.body: ; preds = %for.cond
+ %1 = load volatile i32, ptr @x, align 4, !dbg !40, !tbaa !27
+ %call = call i32 @foo(i32 noundef %i.0, i32 noundef %1), !dbg !41
+ %2 = load volatile i32, ptr @x, align 4, !dbg !42, !tbaa !27
+ %add = add nsw i32 %2, %call, !dbg !42
+ store volatile i32 %add, ptr @x, align 4, !dbg !42, !tbaa !27
+ %3 = load volatile i32, ptr @x, align 4, !dbg !43, !tbaa !27
+ %call2 = call i32 @bar(i32 noundef %3), !dbg !44
+ %4 = load volatile i32, ptr @x, align 4, !dbg !45, !tbaa !27
+ %add3 = add nsw i32 %4, %call2, !dbg !45
+ store volatile i32 %add3, ptr @x, align 4, !dbg !45, !tbaa !27
+ br i1 false, label %if.then5, label %if.end6, !dbg !46
+
+if.then5: ; preds = %for.body
+ br label %cleanup, !dbg !47
+
+if.end6: ; preds = %for.body
+ %5 = load volatile i32, ptr @x, align 4, !dbg !48, !tbaa !27
+ %call7 = call i32 @foo(i32 noundef %i.0, i32 noundef %5), !dbg !49
+ %6 = load volatile i32, ptr @x, align 4, !dbg !50, !tbaa !27
+ %add8 = add nsw i32 %6, %call7, !dbg !50
+ store volatile i32 %add8, ptr @x, align 4, !dbg !50, !tbaa !27
+ %7 = load volatile i32, ptr @x, align 4, !dbg !51, !tbaa !27
+ %call9 = call i32 @bar(i32 noundef %7), !dbg !52
+ %8 = load volatile i32, ptr @x, align 4, !dbg !53, !tbaa !27
+ %add10 = add nsw i32 %8, %call9, !dbg !53
+ store volatile i32 %add10, ptr @x, align 4, !dbg !53, !tbaa !27
+ %inc = add nsw i32 %i.0, 1, !dbg !54
+ br label %for.cond, !dbg !56, !llvm.loop !57
+
+cleanup: ; preds = %if.then5, %for.cond.cleanup
+ br label %for.end
+
+for.end: ; preds = %cleanup, %if.then
+ ret i32 0, !dbg !61
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3
+
+attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #1 = { alwaysinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #2 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "path")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{!"clang version 19.0.0git"}
+!9 = distinct !DISubprogram(name: "bar", scope: !10, file: !10, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DIFile(filename: "test.c", directory: "path")
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 3, column: 3, scope: !9)
+!14 = distinct !DISubprogram(name: "foo", scope: !10, file: !10, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = !DILocation(line: 7, column: 9, scope: !14)
+!16 = !DILocation(line: 7, column: 7, scope: !14)
+!17 = !DILocation(line: 7, column: 23, scope: !18)
+!18 = !DILexicalBlockFile(scope: !14, file: !10, discriminator: 2)
+!19 = !DILocation(line: 7, column: 15, scope: !18)
+!20 = !DILocation(line: 8, column: 21, scope: !14)
+!21 = !DILocation(line: 8, column: 15, scope: !14)
+!22 = !DILocation(line: 8, column: 8, scope: !14)
+!23 = !DILocation(line: 0, scope: !14)
+!24 = !DILocation(line: 9, column: 1, scope: !14)
+!25 = distinct !DISubprogram(name: "main", scope: !10, file: !10, line: 11, type: !11, scopeLine: 11, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!26 = !DILocation(line: 12, column: 7, scope: !25)
+!27 = !{!28, !28, i64 0}
+!28 = !{!"int", !29, i64 0}
+!29 = !{!"omnipotent char", !30, i64 0}
+!30 = !{!"Simple C/C++ TBAA"}
+!31 = !DILocation(line: 12, column: 9, scope: !25)
+!32 = !DILocation(line: 13, column: 5, scope: !25)
+!33 = !DILocation(line: 14, column: 8, scope: !25)
+!34 = !DILocation(line: 14, scope: !25)
+!35 = !DILocation(line: 14, column: 21, scope: !36)
+!36 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 2)
+!37 = !DILocation(line: 14, column: 3, scope: !36)
+!38 = !DILocation(line: 14, column: 3, scope: !39)
+!39 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 4)
+!40 = !DILocation(line: 15, column: 18, scope: !25)
+!41 = !DILocation(line: 15, column: 11, scope: !25)
+!42 = !DILocation(line: 15, column: 8, scope: !25)
+!43 = !DILocation(line: 16, column: 15, scope: !25)
+!44 = !DILocation(line: 16, column: 11, scope: !25)
+!45 = !DILocation(line: 16, column: 8, scope: !25)
+!46 = !DILocation(line: 17, column: 10, scope: !25)
+!47 = !DILocation(line: 18, column: 8, scope: !25)
+!48 = !DILocation(line: 19, column: 18, scope: !25)
+!49 = !DILocation(line: 19, column: 11, scope: !25)
+!50 = !DILocation(line: 19, column: 8, scope: !25)
+!51 = !DILocation(line: 20, column: 15, scope: !25)
+!52 = !DILocation(line: 20, column: 11, scope: !25)
+!53 = !DILocation(line: 20, column: 8, scope: !25)
+!54 = !DILocation(line: 14, column: 37, scope: !55)
+!55 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 6)
+!56 = !DILocation(line: 14, column: 3, scope: !55)
+!57 = distinct !{!57, !58, !59, !60}
+!58 = !DILocation(line: 14, column: 3, scope: !25)
+!59 = !DILocation(line: 21, column: 3, scope: !25)
+!60 = !{!"llvm.loop.mustprogress"}
+!61 = !DILocation(line: 22, column: 1, scope: !25)
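The matching lines checked in this test come from anchor-based stale profile matching: direct-call and inlined callsites act as anchors keyed by callee name, and the matcher maps callsite locations in the current IR back to locations in the stale profile. LLVM's real matcher is more involved, but a greedy in-order sketch (with hypothetical helper names) reproduces the mapping checked for main above:

    // Greedy sketch of anchor-based matching (a simplification, not LLVM's
    // actual matcher): walk anchors in order and pair each IR anchor with
    // the next profile anchor that has the same callee name.
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using Anchor = std::pair<int, std::string>; // (location, callee name)

    static std::map<int, int> matchAnchors(const std::vector<Anchor> &From,
                                           const std::vector<Anchor> &To) {
      std::map<int, int> LocMap; // location in From -> location in To
      size_t I = 0;
      for (const Anchor &A : From) {
        while (I < To.size() && To[I].second != A.second)
          ++I; // skip anchors whose callee does not match
        if (I == To.size())
          break;
        LocMap[A.first] = To[I++].first;
      }
      return LocMap;
    }

    int main() {
      // main() in this test: IR anchors from the changed source vs. profile
      // anchors from the stale profile input above.
      std::vector<Anchor> IR = {{4, "foo"}, {5, "bar"}, {8, "foo"}, {9, "bar"}};
      std::vector<Anchor> Prof = {{2, "foo"}, {3, "bar"}, {4, "foo"}, {5, "bar"}};
      for (const auto &KV : matchAnchors(IR, Prof))
        std::cout << "matched from " << KV.first << " to " << KV.second << "\n";
      // Prints 4->2, 5->3, 8->4, 9->5, matching the CHECK lines above.
    }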
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
index 4881937..43be142 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
@@ -1,7 +1,9 @@
; REQUIRES: x86_64-linux
; REQUIRES: asserts
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='thinlto<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s
+; There is no profile-checksum-mismatch attribute, so even though the checksum is mismatched in the pseudo_probe_desc, stale profile matching is not run for main.
+; CHECK-NOT: Run stale profile matching for main
; CHECK: Run stale profile matching for bar
; CHECK: Callsite with callee:baz is matched from 4 to 2
@@ -14,7 +16,7 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-define i32 @main() #0 {
+define available_externally i32 @main() #0 {
%1 = call i32 @bar(), !dbg !13
ret i32 0
}
@@ -47,7 +49,8 @@ attributes #1 = { "profile-checksum-mismatch" "use-sample-profile" }
!9 = distinct !DICompileUnit(language: DW_LANG_C11, file: !10, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
!10 = !DIFile(filename: "test2.c", directory: "/home/test", checksumkind: CSK_MD5, checksum: "553093afc026f9c73562eb3b0c5b7532")
!11 = !{i32 2, !"Debug Info Version", i32 3}
-!12 = !{i64 -2624081020897602054, i64 281582081721716, !"main"}
+; Force a checksum mismatch in the pseudo_probe_desc
+!12 = !{i64 -2624081020897602054, i64 123456, !"main"}
!13 = !DILocation(line: 8, column: 10, scope: !14)
!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 186646591)
!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 7, column: 40)
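The behavior this test pins down is the gate on per-function matching: only functions carrying the profile-checksum-mismatch attribute are rematched, so the desc-level mismatch injected in !12 above must not trigger a run for main. A simplified sketch of that condition (not the exact LLVM code):

    #include "llvm/IR/Function.h"

    // Simplified sketch, not the exact LLVM code: stale profile matching is
    // keyed off the per-function "profile-checksum-mismatch" attribute; a
    // mismatch that exists only in the pseudo_probe_desc metadata (like the
    // edited !12 above) does not enable it.
    static bool shouldRunStaleMatching(const llvm::Function &F) {
      return F.hasFnAttribute("profile-checksum-mismatch");
    }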
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
index 4647a34f..f0b6fdf 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
@@ -23,21 +23,21 @@ Merge:
; JT-LABEL-NO: T
; JT-LABEL-NO: F
; JT-LABEL: Merge
+; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4
; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3
-; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2
-; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; ASM-NOT: .pseudoprobe 6699318081062747564 4
; ASM-NOT: .pseudoprobe 6699318081062747564 3
-; ASM-NOT: .pseudoprobe 6699318081062747564 2
-; ASM: .pseudoprobe 6699318081062747564 4 0 0
+; ASM: .pseudoprobe 6699318081062747564 5 0 0
ret i32 %call
}
;; Check block T and F are gone, and their probes (probe 2 and 3) are gone too.
; MIR-tail: bb.0
; MIR-tail: PSEUDO_PROBE [[#GUID:]], 1, 0, 0
-; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 2
; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 3
-; MIR-tail: PSEUDO_PROBE [[#GUID:]], 4, 0, 0
+; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 4
+; MIR-tail: PSEUDO_PROBE [[#GUID:]], 5, 0, 0
define i32 @test(i32 %a, i32 %b, i32 %c) {
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
index 62f0737..97b0ed6 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
@@ -62,10 +62,10 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "fra
; DEBUG: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
; DEBUG: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
-
+
; PROBE: ![[CALL1]] = !DILocation(line: 4, column: 3, scope: ![[CALL1BLOCK:[0-9]+]])
-; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646575)
+; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646559)
; PROBE: ![[CALL2]] = !DILocation(line: 4, column: 9, scope: ![[CALL2BLOCK:[0-9]+]])
-; PROBE: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646583)
+; PROBE: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646567)
; PROBE: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
; PROBE: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
index 822ab40..03bb64b 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
@@ -18,10 +18,12 @@ entry:
if.then: ; preds = %entry
; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 2
+; callsite probe 3
invoke void @_Z3foov()
to label %invoke.cont unwind label %terminate.lpad, !dbg !24
invoke.cont: ; preds = %if.then
+; callsite probe 4
; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
invoke void @_Z3bazv()
to label %invoke.cont1 unwind label %terminate.lpad, !dbg !26
@@ -31,7 +33,8 @@ invoke.cont1: ; preds = %invoke.cont
br label %if.end, !dbg !27
if.else: ; preds = %entry
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 3
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+; callsite probe 6
invoke void @_Z3foov()
to label %invoke.cont2 unwind label %terminate.lpad, !dbg !28
@@ -40,7 +43,8 @@ invoke.cont2: ; preds = %if.else
br label %if.end
if.end: ; preds = %invoke.cont2, %invoke.cont1
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 4
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 7
+; callsite probe 8
invoke void @_Z3foov()
to label %invoke.cont3 unwind label %terminate.lpad, !dbg !29
@@ -51,14 +55,14 @@ invoke.cont3: ; preds = %if.end
br i1 %tobool4, label %if.then5, label %if.end6, !dbg !32
if.then5: ; preds = %invoke.cont3
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 9
%2 = load volatile i32, ptr @x, align 4, !dbg !33, !tbaa !19
%inc = add nsw i32 %2, 1, !dbg !33
store volatile i32 %inc, ptr @x, align 4, !dbg !33, !tbaa !19
br label %if.end6, !dbg !35
if.end6: ; preds = %if.then5, %invoke.cont3
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 6
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 10
ret void, !dbg !36
terminate.lpad: ; preds = %if.end, %if.else, %invoke.cont, %if.then
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
index 148f3ed..379dcfc 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
@@ -29,7 +29,7 @@ if.else:
br label %return
return:
- call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1)
+ call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1)
%1 = load i32, ptr %retval, align 4
ret i32 %1
}
@@ -55,13 +55,12 @@ attributes #0 = {"use-sample-profile"}
!9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !5, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
!10 = !{!"function_entry_count", i64 14}
!11 = !{!"branch_weights", i32 100, i32 0}
-;; A discriminator of 186646575 which is 0x6f80057 in hexdecimal, stands for an indirect call probe
-;; with an index of 5 and probe factor of 1.0.
-!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646575)
+;; A discriminator of 186646559, which is 0xB20001F in hexadecimal, stands for an indirect call probe
+;; with an index of 3 and probe factor of 1.0.
+!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646559)
!13 = distinct !DILocation(line: 10, column: 11, scope: !12)
-;; A discriminator of 134217775 which is 0x6f80057 in hexdecimal, stands for an indirect call probe
-;; with an index of 5 and probe factor of 0.
-!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217775)
+;; A discriminator of 134217759, which is 0x800001F in hexadecimal, stands for an indirect call probe
+;; with an index of 3 and probe factor of 0.
+!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217759)
!15 = distinct !DILocation(line: 10, column: 11, scope: !14)
!16 = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-
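The discriminator constants in these comments follow a packed layout: the low three bits are the 0x7 pseudo-probe marker, the probe index sits in bits [18:3], and the probe factor, scaled by 100, sits in bits [25:19]. A decoding sketch under that assumed layout, checked against the two values above:

    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    // Hedged decoding sketch for the packed probe-discriminator layout
    // implied by the comments above: low 3 bits are the 0x7 marker, probe
    // index in bits [18:3], probe factor (scaled by 100) in bits [25:19].
    static uint32_t probeIndex(uint32_t D) { return (D >> 3) & 0xFFFF; }
    static double probeFactor(uint32_t D) { return ((D >> 19) & 0x7F) / 100.0; }

    int main() {
      for (uint32_t D : {186646559u, 134217759u})
        std::printf("0x%X: index %u, factor %.1f\n", D, probeIndex(D),
                    probeFactor(D));
      // Prints: 0xB20001F: index 3, factor 1.0
      //         0x800001F: index 3, factor 0.0
    }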
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 474b666..867a49d 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -22,12 +22,12 @@ if.then:
if.else:
; CHECK: call {{.*}}, !dbg ![[#PROBE2:]], !prof ![[PROF2:[0-9]+]]
call void %f(i32 2)
- ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+ ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
store i32 2, ptr %retval, align 4
br label %return
return:
- ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+ ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
%1 = load i32, ptr %retval, align 4
ret i32 %1
}
@@ -36,14 +36,14 @@ attributes #0 = {"use-sample-profile"}
; CHECK: ![[PD1]] = !{!"branch_weights", i32 8, i32 7}
; CHECK: ![[#PROBE1]] = !DILocation(line: 0, scope: ![[#SCOPE1:]])
+;; A discriminator of 119537695, which is 0x720001f in hexadecimal, stands for an indirect call probe
+;; with an index of 3.
+; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537695)
+; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
;; A discriminator of 119537711, which is 0x720002f in hexadecimal, stands for an indirect call probe
;; with an index of 5.
-; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
-; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-;; A discriminator of 119537719 which is 0x7200037 in hexdecimal, stands for an indirect call probe
-;; with an index of 6.
; CHECK: ![[#PROBE2]] = !DILocation(line: 0, scope: ![[#SCOPE2:]])
-; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537719)
+; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
; CHECK: ![[PROF2]] = !{!"VP", i32 0, i64 6, i64 -1069303473483922844, i64 4, i64 9191153033785521275, i64 2}
!llvm.module.flags = !{!9, !10}
@@ -83,7 +83,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '7'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '5'
+;YAML-NEXT: - ProbeId: '3'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
@@ -113,7 +113,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '6'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '6'
+;YAML-NEXT: - ProbeId: '5'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
@@ -128,7 +128,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '6'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '3'
+;YAML-NEXT: - ProbeId: '4'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
@@ -143,7 +143,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '13'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '4'
+;YAML-NEXT: - ProbeId: '6'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
index 992afed..217b619 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
@@ -14,15 +14,15 @@ T1:
%v1 = call i32 @f1()
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
;; The distribution factor -8513881372706734080 stands for 53.85%, which is from 7/(6+7).
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -8513881372706734080)
%cond3 = icmp eq i32 %v1, 412
br label %Merge
F1:
; CHECK: %v2 = call i32 @f2(), !prof ![[#PROF2:]]
%v2 = call i32 @f2()
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
;; The distribution factor 8513881922462547968 stands for 46.15%, which is from 6/(6+7).
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 8513881922462547968)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 8513881922462547968)
br label %Merge
Merge:
@@ -30,11 +30,11 @@ Merge:
%B = phi i32 [%v1, %T1], [%v2, %F1]
br i1 %A, label %T2, label %F2
T2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 7, i32 0, i64 -1)
call void @f3()
ret i32 %B
F2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 9, i32 0, i64 -1)
ret i32 %B
}
@@ -42,4 +42,3 @@ F2:
; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 6}
attributes #0 = {"use-sample-profile"}
-
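The i64 distribution factors checked above read as fixed-point fractions of 2^64 once reinterpreted as unsigned, which is how the quoted percentages fall out of the 7:6 block-frequency split. A sketch of that conversion (an assumption inferred from the comments, not code quoted from the pass):

    #include <cstdint>
    #include <cstdio>

    // Hedged sketch: the i64 probe distribution factor behaves as a fixed-
    // point fraction of 2^64 when reinterpreted as unsigned, reproducing the
    // percentages in the comments above (7/13 = 53.85%, 6/13 = 46.15%).
    static double factorToPercent(int64_t F) {
      return (double)(uint64_t)F / 18446744073709551616.0 * 100.0; // / 2^64
    }

    int main() {
      std::printf("%.2f%%\n", factorToPercent(-8513881372706734080LL)); // 53.85%
      std::printf("%.2f%%\n", factorToPercent(8513881922462547968LL));  // 46.15%
    }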
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
index f70e518..b622cfb 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
@@ -4,7 +4,7 @@
; VERIFY: *** Pseudo Probe Verification After LoopFullUnrollPass ***
; VERIFY: Function foo:
-; VERIFY-DAG: Probe 6 previous factor 1.00 current factor 5.00
+; VERIFY-DAG: Probe 5 previous factor 1.00 current factor 5.00
; VERIFY-DAG: Probe 4 previous factor 1.00 current factor 5.00
declare void @foo2() nounwind
@@ -27,15 +27,15 @@ bb7.preheader:
bb10:
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
%indvars.iv = phi i64 [ 0, %bb7.preheader ], [ %indvars.iv.next, %bb10 ]
%tmp1.14 = phi i32 [ %tmp1.06, %bb7.preheader ], [ %spec.select, %bb10 ]
@@ -50,14 +50,14 @@ bb10:
br i1 %exitcond.not, label %bb3.loopexit, label %bb10, !llvm.loop !13
bb24:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
ret void
}
;; A discriminator of 186646583, which is 0xb200037 in hexadecimal, stands for a direct call probe
;; with an index of 6 and a scale of -1%.
; CHECK: ![[#PROBE6]] = !DILocation(line: 2, column: 20, scope: ![[#SCOPE:]])
-; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646583)
+; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646575)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
new file mode 100644
index 0000000..2031c2d
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+
+; standard vector concatenations
+
+define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_zext_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = zext <8 x i16> %a0 to <8 x i32>
+ %x1 = zext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = zext nneg <8 x i16> %a0 to <8 x i32>
+ %x1 = zext nneg <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; SSE-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; SSE-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
+; SSE-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <16 x i32> [[R]]
+;
+; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; AVX-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = zext nneg <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = sext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
+; CHECK-LABEL: @concat_sext_v4i1_v8i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %x0 = sext <4 x i1> %a0 to <4 x i32>
+ %x1 = sext <4 x i1> %a1 to <4 x i32>
+ %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
+define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_trunc_v4i32_v8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %x0 = trunc <4 x i32> %a0 to <4 x i16>
+ %x1 = trunc <4 x i32> %a1 to <4 x i16>
+ %r = shufflevector <4 x i16> %x0, <4 x i16> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %r
+}
+
+define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
+; CHECK-NEXT: ret <8 x ptr> [[R]]
+;
+ %x0 = inttoptr <4 x i32> %a0 to <4 x ptr>
+ %x1 = inttoptr <4 x i32> %a1 to <4 x ptr>
+ %r = shufflevector <4 x ptr> %x0, <4 x ptr> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x ptr> %r
+}
+
+define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
+; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64>
+; CHECK-NEXT: ret <16 x i64> [[R]]
+;
+ %x0 = ptrtoint <8 x ptr> %a0 to <8 x i64>
+ %x1 = ptrtoint <8 x ptr> %a1 to <8 x i64>
+ %r = shufflevector <8 x i64> %x0, <8 x i64> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i64> %r
+}
+
+define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: @concat_fpext_v4f32_v8f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; SSE-NEXT: ret <8 x double> [[R]]
+;
+; AVX-LABEL: @concat_fpext_v4f32_v8f64(
+; AVX-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
+; AVX-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
+; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: ret <8 x double> [[R]]
+;
+ %x0 = fpext <4 x float> %a0 to <4 x double>
+ %x1 = fpext <4 x float> %a1 to <4 x double>
+ %r = shufflevector <4 x double> %x0, <4 x double> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %r
+}
+
+define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> %a1) {
+; CHECK-LABEL: @concat_fptrunc_v8f64_v16f32(
+; CHECK-NEXT: [[X0:%.*]] = fptrunc <8 x double> [[A0:%.*]] to <8 x float>
+; CHECK-NEXT: [[X1:%.*]] = fptrunc <8 x double> [[A1:%.*]] to <8 x float>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[X0]], <8 x float> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <16 x float> [[R]]
+;
+ %x0 = fptrunc <8 x double> %a0 to <8 x float>
+ %x1 = fptrunc <8 x double> %a1 to <8 x float>
+ %r = shufflevector <8 x float> %x0, <8 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %r
+}
+
+; commuted vector concatenation
+
+define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @rconcat_sext_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = sext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i32> %r
+}
+
+; interleaved shuffle
+
+define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: @interleave_fpext_v4f32_v8f64(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; CHECK-NEXT: ret <8 x double> [[R]]
+;
+ %x0 = fpext <4 x float> %a0 to <4 x double>
+ %x1 = fpext <4 x float> %a1 to <4 x double>
+ %r = shufflevector <4 x double> %x0, <4 x double> %x1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x double> %r
+}
+
+; negative - multiuse
+
+define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) {
+; CHECK-LABEL: @concat_trunc_v4i32_v8i16_multiuse(
+; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16>
+; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i16>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2:%.*]], align 8
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %x0 = trunc <4 x i32> %a0 to <4 x i16>
+ %x1 = trunc <4 x i32> %a1 to <4 x i16>
+ %r = shufflevector <4 x i16> %x0, <4 x i16> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <4 x i16> %x0, ptr %a2
+ ret <8 x i16> %r
+}
+
+; negative - bitcasts
+
+define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_bitcast_v4i32_v8f32(
+; CHECK-NEXT: [[X0:%.*]] = bitcast <4 x i32> [[A0:%.*]] to <4 x float>
+; CHECK-NEXT: [[X1:%.*]] = bitcast <4 x i32> [[A1:%.*]] to <4 x float>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x float> [[R]]
+;
+ %x0 = bitcast <4 x i32> %a0 to <4 x float>
+ %x1 = bitcast <4 x i32> %a1 to <4 x float>
+ %r = shufflevector <4 x float> %x0, <4 x float> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %r
+}
+
+; negative - src type mismatch
+
+define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_v4i8_v4i16_v8i32(
+; CHECK-NEXT: [[X0:%.*]] = sext <4 x i8> [[A0:%.*]] to <4 x i32>
+; CHECK-NEXT: [[X1:%.*]] = sext <4 x i16> [[A1:%.*]] to <4 x i32>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %x0 = sext <4 x i8> %a0 to <4 x i32>
+ %x1 = sext <4 x i16> %a1 to <4 x i32>
+ %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
+; negative - castop mismatch
+
+define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_zext_v8i16_v16i32(
+; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = zext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
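Read together, the positive and negative cases in this new test outline the structural conditions under which vector-combine folds shuffle(cast(x), cast(y)) into cast(shuffle(x, y)): both casts must use the same non-bitcast opcode on matching source types and be single-use, after which the target cost model decides (hence the SSE/AVX splits). A hedged sketch of that check, not the pass's actual code:

    // Hedged sketch (not the actual VectorCombine code) of the structural
    // conditions the tests above exercise for folding
    // shuffle(cast(x), cast(y)) -> cast(shuffle(x, y)).
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static bool canFoldShuffleOfCasts(CastInst *C0, CastInst *C1) {
      if (!C0 || !C1)
        return false;
      // Negative test "castop mismatch": opcodes must agree. (The pass also
      // lets "zext nneg" pair with "sext", per the third test above.)
      if (C0->getOpcode() != C1->getOpcode())
        return false;
      if (isa<BitCastInst>(C0)) // negative test: bitcasts are skipped
        return false;
      if (C0->getSrcTy() != C1->getSrcTy()) // negative: src type mismatch
        return false;
      if (!C0->hasOneUse() || !C1->hasOneUse()) // negative: multiuse
        return false;
      // A target cost comparison then decides; that is why several cases
      // split between the SSE and AVX check prefixes above.
      return true;
    }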
diff --git a/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll
new file mode 100644
index 0000000..435073d
--- /dev/null
+++ b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll
@@ -0,0 +1,19 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+; CHECK: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present
+
+;--- err1.ll
+
+; RUN: not llvm-as err1.ll -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2}
+
+;--- err2.ll
+
+; RUN: not llvm-as err2.ll -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31}
diff --git a/llvm/test/Verifier/pr69428.ll b/llvm/test/Verifier/pr69428.ll
new file mode 100644
index 0000000..be8733b
--- /dev/null
+++ b/llvm/test/Verifier/pr69428.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as -disable-output %s
+
+%struct._List_node_emplace_op2 = type { i8 }
+
+@"?_List@@3HA" = global i32 0, align 4
+
+define void @"?ExecutionEngineaddExecutableDependency@@YAXXZ"() personality ptr @__CxxFrameHandler3 {
+entry:
+ %agg.tmp.ensured.i = alloca %struct._List_node_emplace_op2, align 1
+ %0 = load i32, ptr @"?_List@@3HA", align 4
+ %call.i = call noundef ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr %agg.tmp.ensured.i, i32 %0)
+ invoke void @llvm.seh.scope.begin()
+ to label %invoke.cont.i unwind label %ehcleanup.i
+
+invoke.cont.i: ; preds = %entry
+ invoke void @llvm.seh.scope.end()
+ to label %invoke.cont2.i unwind label %ehcleanup.i
+
+invoke.cont2.i: ; preds = %invoke.cont.i
+ call void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6
+ unreachable
+
+ehcleanup.i: ; preds = %invoke.cont.i, %entry
+ %1 = cleanuppad within none []
+ invoke void @llvm.seh.scope.begin()
+ to label %invoke.cont.i.i unwind label %ehcleanup.i.i
+
+invoke.cont.i.i: ; preds = %ehcleanup.i
+ invoke void @llvm.seh.scope.end()
+ to label %"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i" unwind label %ehcleanup.i.i
+
+ehcleanup.i.i: ; preds = %invoke.cont.i.i, %ehcleanup.i
+ %2 = cleanuppad within %1 []
+ call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %2) ]
+ cleanupret from %2 unwind to caller
+
+"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i": ; preds = %invoke.cont.i.i
+ call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %1) ]
+ cleanupret from %1 unwind to caller
+}
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @llvm.seh.scope.begin()
+declare void @llvm.seh.scope.end()
+
+declare void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr)
+declare void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr)
+declare ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr, i32)
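
The regression case above exercises nested cleanup funclets with "funclet" bundles. Stripped of the nesting, the basic scope pairing the verifier must accept looks like this; a minimal hand-written sketch, not the regression input itself:

define void @scope_pair() personality ptr @__CxxFrameHandler3 {
entry:
  ; seh.scope intrinsics are invoked so the scope has an unwind edge
  invoke void @llvm.seh.scope.begin()
          to label %cont unwind label %cleanup

cont:
  invoke void @llvm.seh.scope.end()
          to label %exit unwind label %cleanup

exit:
  ret void

cleanup:
  %cp = cleanuppad within none []
  cleanupret from %cp unwind to caller
}

declare i32 @__CxxFrameHandler3(...)
declare void @llvm.seh.scope.begin()
declare void @llvm.seh.scope.end()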
diff --git a/llvm/test/tools/dsymutil/ARM/firmware.test b/llvm/test/tools/dsymutil/ARM/firmware.test
new file mode 100644
index 0000000..128faa5
--- /dev/null
+++ b/llvm/test/tools/dsymutil/ARM/firmware.test
@@ -0,0 +1,11 @@
+$ cat test.c
+int main() {
+ return 0;
+}
+
+$ xcrun clang -O0 -target arm64-apple-unknown-macho test.c -c -o test.o
+$ xcrun ld -arch arm64 -o test.out test.o -platform_version firmware 0 0
+
+RUN: dsymutil -oso-prepend-path %p/../Inputs %p/../Inputs/private/tmp/firmware/test.out -o %t.dSYM
+RUN: llvm-objdump -h %t.dSYM/Contents/Resources/DWARF/test.out | FileCheck %s
+CHECK: file format mach-o arm64
diff --git a/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o
new file mode 100644
index 0000000..3bc83ca
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o
Binary files differ
diff --git a/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out
new file mode 100755
index 0000000..21fe4d2
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out
Binary files differ
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
index 98b8619..1b196b4 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
@@ -1189,9 +1189,9 @@ vzeroupper
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 14 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 19 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 6 19 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 20 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 6 20 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 2 1.00 vextractps $1, %xmm0, %ecx
@@ -1632,17 +1632,17 @@ vzeroupper
# CHECK-NEXT: 4 17 2.00 * vrcpps (%rax), %ymm2
# CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrsqrtps (%rax), %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 257.00 215.25 235.25 176.17 176.17 38.00 424.25 2.25 12.67
+# CHECK-NEXT: - 257.00 216.25 247.25 173.17 173.17 38.00 424.25 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1899,9 +1899,9 @@ vzeroupper
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
@@ -2342,17 +2342,17 @@ vzeroupper
# CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
index a2899b4..4865121 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
@@ -166,7 +166,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 14 1.00 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 4 14 2.00 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 19 2.00 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 6 19 2.00 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 15 2.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1
# CHECK-NEXT: 3 7 1.00 * ptest (%rax), %xmm1
-# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundss $1, (%rax), %xmm2
# CHECK: Resources:
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 23.33 22.33 25.67 25.67 5.00 80.33 - 1.67
+# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -281,7 +281,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - dppd $22, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
@@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
index 376070d..05c4760 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
@@ -1189,9 +1189,9 @@ vzeroupper
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 15 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 20 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 6 20 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 21 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 6 21 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 2 1.00 vextractps $1, %xmm0, %ecx
@@ -1632,17 +1632,17 @@ vzeroupper
# CHECK-NEXT: 4 18 2.00 * vrcpps (%rax), %ymm2
# CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: 3 13 2.00 * vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: 3 13 2.00 * vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: 2 11 1.00 * vrsqrtps (%rax), %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 336.00 214.58 236.58 176.17 176.17 38.00 427.58 2.25 12.67
+# CHECK-NEXT: - 336.00 215.58 248.58 173.17 173.17 38.00 427.58 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1899,9 +1899,9 @@ vzeroupper
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
@@ -2342,17 +2342,17 @@ vzeroupper
# CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
index 70d9398..62dfa23 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
@@ -166,7 +166,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 15 1.00 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 4 14 2.00 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 20 2.00 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 6 20 2.00 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 16 2.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1
# CHECK-NEXT: 3 8 1.00 * ptest (%rax), %xmm1
-# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundss $1, (%rax), %xmm2
# CHECK: Resources:
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 23.33 22.33 25.67 25.67 5.00 80.33 - 1.67
+# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -281,7 +281,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - dppd $22, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
@@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index c2e0217..ef5a9e3 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1189,9 +1189,9 @@ vzeroupper
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 15 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 13 1.50 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 19 1.50 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 6 19 1.50 * vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 13 1.50 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 20 1.50 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 6 20 1.50 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 3 1.00 vextractps $1, %xmm0, %ecx
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 126.00 338.58 199.58 173.83 173.83 38.00 326.58 5.25 11.33
+# CHECK-NEXT: - 126.00 339.58 199.58 173.83 173.83 38.00 326.58 6.25 11.33
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1899,9 +1899,9 @@ vzeroupper
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 1.50 1.50 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 1.50 1.50 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 6e11bb6..1d8d67f 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -166,7 +166,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 15 1.00 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 4 13 1.50 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 19 1.50 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 6 19 1.50 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 3 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 37.33 31.33 23.67 23.67 5.00 63.33 - 1.67
+# CHECK-NEXT: - - 37.83 31.33 23.67 23.67 5.00 63.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -281,7 +281,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - dppd $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 1.50 1.50 0.50 0.50 - 1.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
index de14ef7..cabb002 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
@@ -1188,10 +1188,10 @@ vzeroupper
# CHECK-NEXT: 2 16 3.00 * vdivss (%rax), %xmm1, %xmm2
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 15 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 4 13 1.33 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 19 1.33 * vdpps $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 4 13 1.33 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 20 1.33 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 4 13 1.50 vdpps $22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 6 19 1.50 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 4 13 1.50 vdpps $22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 6 20 1.50 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 3 1.00 vextractps $1, %xmm0, %ecx
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 126.00 322.25 200.25 173.83 173.83 38.00 330.25 6.25 11.33
+# CHECK-NEXT: - 126.00 325.25 202.25 173.83 173.83 38.00 326.25 7.25 11.33
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1898,10 +1898,10 @@ vzeroupper
# CHECK-NEXT: - 3.00 1.00 - 0.50 0.50 - - - - vdivss (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 0.67 0.67 - - - 1.67 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 0.67 0.67 0.50 0.50 - 1.67 - - vdppd $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 - - - 2.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 2.00 - - vdpps $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 - - - 2.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 2.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
index 15cd09b..e3f34fd 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
@@ -165,8 +165,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 8 0.67 * blendvps %xmm0, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 15 1.00 * dppd $22, (%rax), %xmm2
-# CHECK-NEXT: 4 13 1.33 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 19 1.33 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 4 13 1.50 dpps $22, %xmm0, %xmm2
+# CHECK-NEXT: 6 19 1.50 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 3 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 36.67 28.67 23.67 23.67 5.00 66.67 - 1.67
+# CHECK-NEXT: - - 38.17 29.67 23.67 23.67 5.00 64.67 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -280,8 +280,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 0.67 0.67 0.50 0.50 - 0.67 - - blendvps %xmm0, (%rax), %xmm2
# CHECK-NEXT: - - 0.67 0.67 - - - 1.67 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 0.67 0.67 0.50 0.50 - 1.67 - - dppd $22, (%rax), %xmm2
-# CHECK-NEXT: - - 1.00 1.00 - - - 2.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 2.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - dpps $22, %xmm0, %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
index 4654ce1..349abec 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
@@ -68,12 +68,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -145,12 +145,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -223,12 +223,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -306,12 +306,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -389,12 +389,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -472,12 +472,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
index 12d6f39..0fcd6f5 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
@@ -46,12 +46,12 @@ add %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -122,12 +122,12 @@ add %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
index 93f8d76..cd427bb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
@@ -41,12 +41,12 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -112,12 +112,12 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
index 13ef5bc..bf82486 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
@@ -43,12 +43,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -115,12 +115,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
index bfe8be8..8a5a014 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
@@ -44,12 +44,12 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -116,12 +116,12 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
index 1431875..f0e16a8 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
@@ -68,12 +68,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
index eb2bb97..97f6a34 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
@@ -68,12 +68,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
index 5909af8..c733f63 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
@@ -63,12 +63,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -154,12 +154,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -245,12 +245,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
index 5a05487..63df99e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
@@ -68,12 +68,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
index 7ac674c..66c1322 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
@@ -40,12 +40,12 @@ xor %bx, %dx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
index 582da14..4ed529e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
@@ -40,12 +40,12 @@ add %cx, %bx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
index dda87e9..5894111 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
@@ -33,12 +33,12 @@ lzcnt %ax, %bx ## partial register stall.
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
index 71520ea..fdbf4d9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
@@ -42,12 +42,12 @@ lzcnt 2(%rsp), %cx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
index 7afa80c..f3e515c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
@@ -180,12 +180,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
index 8b81d55..a484a75 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
@@ -180,12 +180,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
index f359048..eb20d13 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
@@ -134,12 +134,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -402,12 +402,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -670,12 +670,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -938,12 +938,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
index b556fd6..e17d671 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
@@ -61,12 +61,12 @@ movq %mm7, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
index 147cb0f..b45fd17 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
@@ -180,12 +180,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
index de59edf..0465d41 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
@@ -67,12 +67,12 @@ fxch %st(0)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
index 4e024e5..9c5a19b 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
@@ -38,12 +38,12 @@ adox (%rbx), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
index 5abf3cc..d108696 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
@@ -50,12 +50,12 @@ aeskeygenassist $22, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
index 146b3ce..4f0b484 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
@@ -1731,12 +1731,12 @@ vzeroupper
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
index 3c6b31a..1a8b9e2 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
@@ -771,12 +771,12 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
index 8c0e841..2600237 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
@@ -85,12 +85,12 @@ tzcnt (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
index 8d00c99..0664c1d 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
@@ -100,12 +100,12 @@ shrx %rax, (%rbx), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
index 3e7219c..b40d155 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
@@ -23,12 +23,12 @@ clflushopt (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
index 0dc89fa..0f9935c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
@@ -23,12 +23,12 @@ clzero
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
index e0e46af..8118e40 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
@@ -218,12 +218,12 @@ cmovgq (%rax), %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
index 03763e5..9ab8776 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
@@ -25,12 +25,12 @@ cmpxchg16b (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
index bb995d5..345ae02 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
@@ -40,12 +40,12 @@ vcvtps2ph $0, %ymm0, (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
index 9af180d..af207f0 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
@@ -500,12 +500,12 @@ vfnmsub231ss (%rax), %xmm1, %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
index 142508c..3e65183 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
@@ -40,12 +40,12 @@ wrgsbase %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
index 1545a22..0257202 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
@@ -293,12 +293,12 @@ lea 1024(%rax, %rbx, 2), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
index ffbe414..735287a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
@@ -35,12 +35,12 @@ lzcntq (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
index 75dbf95..2bc6177 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
@@ -279,12 +279,12 @@ pxor (%rax), %mm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
index 144e97f..6eeabbd 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
@@ -35,12 +35,12 @@ movbe (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
index 3b343d7..103fd3eb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
@@ -25,12 +25,12 @@ mwaitx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
index 2d9f0e9..893f476 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
@@ -25,12 +25,12 @@ pclmulqdq $11, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
index cce078f..29bcc5c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
@@ -35,12 +35,12 @@ popcntq (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
index 5423b6b..b80e8f7 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
@@ -25,12 +25,12 @@ prefetchw (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
index fb09253..649eb10 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
@@ -27,12 +27,12 @@ rdrand %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
index f10a90f..44e0eeb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
@@ -27,12 +27,12 @@ rdseed %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
index 360a667..e6d5ab9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
@@ -55,12 +55,12 @@ sha256rnds2 (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
index 9816b87..4c7a3f0 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
@@ -328,12 +328,12 @@ xorps (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
index f69c535..d24aebf 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
@@ -684,12 +684,12 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
index 8110390..51bb95f 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
@@ -74,12 +74,12 @@ mwait
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
index 0cc6c6a..e952a16 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
@@ -261,12 +261,12 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
index 873e4f4..8afcd80 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
@@ -70,12 +70,12 @@ pcmpgtq (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
index 1c1b0b2..6606a3e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
@@ -35,12 +35,12 @@ movntss %xmm0, (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
index aeec493..6668870 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
@@ -180,12 +180,12 @@ psignw (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
index 076094f..81afc7d 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
@@ -40,12 +40,12 @@ vaesenclast (%rax), %ymm1, %ymm3
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
index 31680d5..10440e9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
@@ -25,12 +25,12 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
index fb09b65..8f627ca 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
@@ -56,12 +56,12 @@ salc
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
index fedb3d2..41ec631 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
@@ -1957,12 +1957,12 @@ xorq (%rax), %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
index 9a92bd0..cd8a06a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
@@ -364,12 +364,12 @@ fyl2xp1
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
index 819361c..f348ff8 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
@@ -35,12 +35,12 @@ xsetbv
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
index 33657e6..ed4e8f9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
@@ -138,12 +138,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -229,12 +229,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -320,12 +320,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -411,12 +411,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -502,12 +502,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -593,12 +593,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -684,12 +684,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -775,12 +775,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -866,12 +866,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -957,12 +957,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1048,12 +1048,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1139,12 +1139,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1230,12 +1230,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1321,12 +1321,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1412,12 +1412,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1503,12 +1503,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1594,12 +1594,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1685,12 +1685,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
index ba7f51e..2404336 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
@@ -148,12 +148,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -239,12 +239,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -330,12 +330,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -421,12 +421,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -512,12 +512,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -603,12 +603,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -694,12 +694,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -785,12 +785,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -876,12 +876,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -967,12 +967,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1058,12 +1058,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1149,12 +1149,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1240,12 +1240,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1331,12 +1331,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1422,12 +1422,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1513,12 +1513,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1604,12 +1604,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1695,12 +1695,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1786,12 +1786,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1878,12 +1878,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
index 018adc2..4d648f7 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
@@ -68,12 +68,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -175,12 +175,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -282,12 +282,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -389,12 +389,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
index 935881a..aca39c5 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
@@ -138,12 +138,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -229,12 +229,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -320,12 +320,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -411,12 +411,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -502,12 +502,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -593,12 +593,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -684,12 +684,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -775,12 +775,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -866,12 +866,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -957,12 +957,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1048,12 +1048,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1139,12 +1139,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1230,12 +1230,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1321,12 +1321,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1412,12 +1412,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1503,12 +1503,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1594,12 +1594,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1685,12 +1685,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s
new file mode 100644
index 0000000..064ffca
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s
@@ -0,0 +1,38 @@
+## Disallow (de)compression for sections within a segment as they are
+## effectively immutable.
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: yaml2obj %s -o a
+# RUN: not llvm-objcopy a /dev/null --compress-sections .text=zlib 2>&1 | FileCheck %s --implicit-check-not=error:
+
+# CHECK: error: 'a': section '.text' within a segment cannot be (de)compressed
+
+# RUN: not llvm-objcopy a /dev/null --compress-sections foo=none 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=error:
+
+# CHECK2: error: 'a': section 'foo' within a segment cannot be (de)compressed
+
+## There is an error even if 'foo' is already compressed with zlib.
+# RUN: not llvm-objcopy a /dev/null --compress-sections foo=zlib 2>&1 | FileCheck %s --check-prefix=CHECK3 --implicit-check-not=error:
+
+# CHECK3: error: 'a': section 'foo' within a segment cannot be (de)compressed
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ FirstSec: .text
+ LastSec: foo
+ Align: 0x1000
+ Offset: 0x1000
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Offset: 0x1000
+ Content: C3
+ - Name: foo
+ Type: SHT_PROGBITS
+ Flags: [ SHF_COMPRESSED ]
+ Content: 010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
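
For context on the constraint this new test exercises: a section whose bytes are covered by a program header cannot change size without invalidating the offsets the segment promises to the loader, so llvm-objcopy rejects the request outright. A minimal sketch of that check, using hypothetical Section/Segment types rather than the real ObjCopy classes:

    #include <cstdint>

    // Hypothetical types for illustration; the real check lives in
    // llvm/lib/ObjCopy/ELF and uses different class names.
    struct Segment { uint64_t Offset = 0, FileSize = 0; };
    struct Section {
      uint64_t Offset = 0, Size = 0;
      const Segment *ParentSegment = nullptr; // set if a segment covers it
    };

    // (De)compressing toggles SHF_COMPRESSED and changes the section's size,
    // which is only safe for sections that no segment maps.
    bool canChangeCompression(const Section &Sec) {
      return Sec.ParentSegment == nullptr;
    }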
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s
new file mode 100644
index 0000000..e6fa860
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s
@@ -0,0 +1,128 @@
+# REQUIRES: x86-registered-target, zlib, zstd
+
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o
+## '*0=none' wins because it is the last. '*0' sections are decompressed (if originally compressed) or kept unchanged (if uncompressed).
+## No section is named 'nomatch'. The third option is a no-op.
+# RUN: llvm-objcopy a.o out --compress-sections='*0=zlib' --compress-sections '*0=none' --compress-sections 'nomatch=none' 2>&1 | count 0
+# RUN: llvm-readelf -S out | FileCheck %s --check-prefix=CHECK1
+
+# CHECK1: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK1: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK1: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK1-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8
+# CHECK1-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK1-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK1: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 7 8
+# CHECK1-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+## Mixing zlib and zstd.
+# RUN: llvm-objcopy a.o out2 --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd
+# RUN: llvm-readelf -Sr -x nonalloc0 -x .debug_str out2 2>&1 | FileCheck %s --check-prefix=CHECK2
+# RUN: llvm-readelf -z -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2DE
+
+# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK2: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK2: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK2-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8
+# CHECK2-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK2-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK2: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8
+# CHECK2-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8
+# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK2-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 8
+
+## llvm-readelf -r doesn't support SHF_COMPRESSED SHT_RELA.
+# CHECK2: warning: {{.*}}: unable to read relocations from SHT_RELA section with index 8: section [index 8] has an invalid sh_size ([[#]]) which is not a multiple of its sh_entsize (24)
+
+# CHECK2: Hex dump of section 'nonalloc0':
+## zlib with ch_size=0x10
+# CHECK2-NEXT: 01000000 00000000 10000000 00000000
+# CHECK2-NEXT: 08000000 00000000 {{.*}}
+# CHECK2: Hex dump of section '.debug_str':
+## zstd with ch_size=0x38
+# CHECK2-NEXT: 02000000 00000000 38000000 00000000
+# CHECK2-NEXT: 01000000 00000000 {{.*}}
+
+# CHECK2DE: Hex dump of section 'nonalloc0':
+# CHECK2DE-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................
+# CHECK2DE-EMPTY:
+# CHECK2DE-NEXT: Hex dump of section '.debug_str':
+# CHECK2DE-NEXT: 0x00000000 41414141 41414141 41414141 41414141 AAAAAAAAAAAAAAAA
+
+## --decompress-debug-sections takes precedence, even if it is before --compress-sections.
+# RUN: llvm-objcopy a.o out3 --decompress-debug-sections --compress-sections .debug_str=zstd
+# RUN: llvm-readelf -S out3 | FileCheck %s --check-prefix=CHECK3
+
+# CHECK3: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+# RUN: llvm-objcopy a.o out4 --compress-sections '*0=zlib'
+# RUN: llvm-readelf -S out4 | FileCheck %s --check-prefix=CHECK4
+
+# CHECK4: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK4: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK4: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 AC 0 0 8
+# CHECK4-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 3 8
+# CHECK4-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK4-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK4: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8
+# CHECK4-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8
+# CHECK4-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK4-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+## If a section is already compressed, a compression request for another format is ignored.
+# RUN: llvm-objcopy a.o out5 --compress-sections 'nonalloc0=zlib'
+# RUN: llvm-readelf -x nonalloc0 out5 | FileCheck %s --check-prefix=CHECK5
+# RUN: llvm-objcopy out5 out5a --compress-sections 'nonalloc0=zstd'
+# RUN: cmp out5 out5a
+
+# CHECK5: Hex dump of section 'nonalloc0':
+## zlib with ch_size=0x10
+# CHECK5-NEXT: 01000000 00000000 10000000 00000000
+# CHECK5-NEXT: 08000000 00000000 {{.*}}
+
+# RUN: not llvm-objcopy --compress-sections=foo a.o out 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error:
+# ERR1: error: --compress-sections: parse error, not 'section-glob=[none|zlib|zstd]'
+
+# RUN: llvm-objcopy --compress-sections 'a[=zlib' a.o out 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error:
+# ERR2: warning: invalid glob pattern, unmatched '['
+
+# RUN: not llvm-objcopy a.o out --compress-sections='.debug*=zlib-gabi' --compress-sections='.debug*=' 2>&1 | \
+# RUN: FileCheck -check-prefix=ERR3 %s
+# ERR3: error: invalid or unsupported --compress-sections format: .debug*=zlib-gabi
+
+# RUN: not llvm-objcopy a.o out --compress-sections='!.debug*=zlib' 2>&1 | \
+# RUN: FileCheck -check-prefix=ERR4 %s
+# ERR4: error: --compress-sections: negative pattern is unsupported
+
+.globl _start
+_start:
+ ret
+
+.section foo0,"a"
+.balign 8
+.quad .text-.
+.quad .text-.
+.section foo1,"a"
+.balign 8
+.quad .text-.
+.quad .text-.
+.section nonalloc0,""
+.balign 8
+.quad .text+1
+.quad .text+2
+sym0:
+.section nonalloc1,""
+.balign 8
+.quad 42
+sym1:
+
+.section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "AAAAAAAAAAAAAAAAAAAAAAAAAAA"
+.Linfo_string1:
+ .asciz "BBBBBBBBBBBBBBBBBBBBBBBBBBB"
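
The hex dumps checked above (CHECK2/CHECK5) begin with the compression header that an SHF_COMPRESSED section prepends to its payload. For 64-bit ELF this is the standard Elf64_Chdr from the ELF gABI; a reference definition, annotated with the values the test decodes:

    #include <cstdint>

    // 64-bit ELF compression header. In the dumps above:
    // ch_type 1 = ELFCOMPRESS_ZLIB, 2 = ELFCOMPRESS_ZSTD;
    // ch_size 0x10 / 0x38 is the uncompressed size.
    struct Elf64_Chdr {
      uint32_t ch_type;
      uint32_t ch_reserved;
      uint64_t ch_size;      // uncompressed section size
      uint64_t ch_addralign; // uncompressed alignment
    };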
diff --git a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
index 4258ddb..d9f4f38 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
@@ -4,6 +4,8 @@
# RUN: yaml2obj %s -o %t
# RUN: llvm-objcopy --decompress-debug-sections %t %t.de
# RUN: llvm-readelf -S %t.de | FileCheck %s
+# RUN: llvm-objcopy --compress-sections '*nonalloc=none' --compress-sections .debugx=none %t %t.1.de
+# RUN: cmp %t.de %t.1.de
# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
# CHECK: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 AC 0 0 0
@@ -11,6 +13,33 @@
# CHECK-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
# CHECK-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0
+# RUN: llvm-objcopy --compress-sections '.debug*=none' %t %t2.de
+# RUN: llvm-readelf -S -x .debug_alloc -x .debug_nonalloc -x .debugx %t2.de | FileCheck %s --check-prefix=CHECK2
+
+# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK2: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 A 0 0 1
+# CHECK2-NEXT: .debug_nonalloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
+# CHECK2-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
+# CHECK2-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0
+
+# CHECK2: Hex dump of section '.debug_alloc':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-EMPTY:
+# CHECK2: Hex dump of section '.debug_nonalloc':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-EMPTY:
+# CHECK2-NEXT: Hex dump of section '.debugx':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+
--- !ELF
FileHeader:
Class: ELFCLASS64
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s
index ece36c6..5600bcd 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s
@@ -6,6 +6,13 @@
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize32,-wavefrontsize64 -filetype=obj > %t.o
; RUN: llvm-objdump --disassemble-symbols=kernel.kd %t.o | FileCheck %s --check-prefixes=COV4,CHECK
+;; Make sure we override the default code object version in the disassembler
+;; for COV6 objects (there are currently no differences between 5 and 6, so we
+;; set the default to 4 to verify that the default is actually overridden).
+; RUN: sed 's/CODE_OBJECT_VERSION/6/g' %s \
+; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize32,-wavefrontsize64 -filetype=obj > %t.o
+; RUN: llvm-objdump -mllvm --amdhsa-code-object-version=4 --disassemble-symbols=kernel.kd %t.o | FileCheck %s --check-prefixes=COV5,CHECK
+
;; Verify that .amdhsa_uses_dynamic_stack is only printed on COV5+.
; CHECK: .amdhsa_kernel kernel
diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
index f28d92e..5125317 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
+++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
@@ -1,98 +1,204 @@
# RUN: rm -rf %t && split-file %s %t && cd %t
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag.s -o tag.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o tag-short.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-long.s -o tag-long.o
-
-# RUN: llvm-readelf --notes tag.o | FileCheck --check-prefix NORMAL %s
-# RUN: llvm-readelf --notes tag-short.o | FileCheck --check-prefix SHORT %s
-# RUN: llvm-readelf --notes tag-long.o | FileCheck --check-prefix LONG %s
-
-# NORMAL: AArch64 PAuth ABI tag: platform 0x2a, version 0x1
-# SHORT: AArch64 PAuth ABI tag: <corrupted size: expected at least 16, got 12>
-# LONG: AArch64 PAuth ABI tag: platform 0x2a, version 0x1, additional info 0xEFCDAB8967452301
-
-# RUN: llvm-readobj --notes tag.o | FileCheck --check-prefix LLVM-NORMAL %s
-# RUN: llvm-readobj --notes tag-short.o | FileCheck --check-prefix LLVM-SHORT %s
-# RUN: llvm-readobj --notes tag-long.o | FileCheck --check-prefix LLVM-LONG %s
-
-# LLVM-SHORT: Notes [
-# LLVM-SHORT-NEXT: NoteSection {
-# LLVM-SHORT-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-SHORT-NEXT: Offset: 0x40
-# LLVM-SHORT-NEXT: Size: 0x1C
-# LLVM-SHORT-NEXT: Note {
-# LLVM-SHORT-NEXT: Owner: ARM
-# LLVM-SHORT-NEXT: Data size: 0xC
-# LLVM-SHORT-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-SHORT-NEXT: Description data (
-# LLVM-SHORT-NEXT: 0000: 2A000000 00000000 01000000
-# LLVM-SHORT-NEXT: )
-# LLVM-SHORT-NEXT: }
-# LLVM-SHORT-NEXT: }
-# LLVM-SHORT-NEXT: ]
-
-# LLVM-NORMAL: Notes [
-# LLVM-NORMAL-NEXT: NoteSection {
-# LLVM-NORMAL-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-NORMAL-NEXT: Offset: 0x40
-# LLVM-NORMAL-NEXT: Size: 0x20
-# LLVM-NORMAL-NEXT: Note {
-# LLVM-NORMAL-NEXT: Owner: ARM
-# LLVM-NORMAL-NEXT: Data size: 0x10
-# LLVM-NORMAL-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-NORMAL-NEXT: Platform: 42
-# LLVM-NORMAL-NEXT: Version: 1
-# LLVM-NORMAL-NEXT: }
-# LLVM-NORMAL-NEXT: }
-# LLVM-NORMAL-NEXT: ]
-
-# LLVM-LONG: Notes [
-# LLVM-LONG-NEXT: NoteSection {
-# LLVM-LONG-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-LONG-NEXT: Offset: 0x40
-# LLVM-LONG-NEXT: Size: 0x28
-# LLVM-LONG-NEXT: Note {
-# LLVM-LONG-NEXT: Owner: ARM
-# LLVM-LONG-NEXT: Data size: 0x18
-# LLVM-LONG-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-LONG-NEXT: Platform: 42
-# LLVM-LONG-NEXT: Version: 1
-# LLVM-LONG-NEXT: Additional info: EFCDAB8967452301
-# LLVM-LONG-NEXT: }
-# LLVM-LONG-NEXT: }
-# LLVM-LONG-NEXT: ]
-
-#--- abi-tag.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 16
-.long 1
-.asciz "ARM"
-
-.quad 42 // platform
-.quad 1 // version
-
-#--- abi-tag-short.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 12
-.long 1
-.asciz "ARM"
-
-.quad 42
-.word 1
-
-#--- abi-tag-long.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 24
-.long 1
-.asciz "ARM"
-
-.quad 42 // platform
-.quad 1 // version
-.quad 0x0123456789ABCDEF // extra data
+#--- gnu-42-1.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 42 // PAuth ABI platform
+ .quad 1 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-42-1.s -o gnu-42-1.o
+# RUN: llvm-readelf --notes gnu-42-1.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s
+# RUN: llvm-readobj --notes gnu-42-1.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s
+
+# ELF: Displaying notes found in: .note.gnu.property
+# ELF-NEXT: Owner Data size Description
+# ELF-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note)
+# ELF-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]]
+
+# OBJ: Notes [
+# OBJ-NEXT: NoteSection {
+# OBJ-NEXT: Name: .note.gnu.property
+# OBJ-NEXT: Offset: 0x40
+# OBJ-NEXT: Size: 0x28
+# OBJ-NEXT: Note {
+# OBJ-NEXT: Owner: GNU
+# OBJ-NEXT: Data size: 0x18
+# OBJ-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+# OBJ-NEXT: Property [
+# OBJ-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]]
+# OBJ-NEXT: ]
+# OBJ-NEXT: }
+# OBJ-NEXT: }
+# OBJ-NEXT: ]
+
+#--- gnu-0-0.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0 // PAuth ABI platform
+ .quad 0 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0-0.s -o gnu-0-0.o
+# RUN: llvm-readelf --notes gnu-0-0.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s
+# RUN: llvm-readobj --notes gnu-0-0.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s
+
+#--- gnu-1-0.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 1 // PAuth ABI platform
+ .quad 0 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-1-0.s -o gnu-1-0.o
+# RUN: llvm-readelf --notes gnu-1-0.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s
+# RUN: llvm-readobj --notes gnu-1-0.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s
+
+#--- gnu-0x10000002-85.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0x10000002 // PAuth ABI platform
+ .quad 85 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-85.s -o gnu-0x10000002-85.o
+# RUN: llvm-readelf --notes gnu-0x10000002-85.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" \
+# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s
+# RUN: llvm-readobj --notes gnu-0x10000002-85.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" \
+# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s
+
+#--- gnu-0x10000002-128.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0x10000002 // PAuth ABI platform
+ .quad 128 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-128.s -o gnu-0x10000002-128.o
+# RUN: llvm-readelf --notes gnu-0x10000002-128.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s
+# RUN: llvm-readobj --notes gnu-0x10000002-128.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s
+
+#--- gnu-short.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 12 // Data size
+ .quad 42 // PAuth ABI platform
+ .word 1 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-short.s -o gnu-short.o
+# RUN: llvm-readelf --notes gnu-short.o | \
+# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=28 -DDATASIZE=18 \
+# RUN: -DERR="<corrupted size: expected 16, got 12>" %s
+# RUN: llvm-readobj --notes gnu-short.o | \
+# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=28 -DDATASIZE=18 \
+# RUN: -DERR="<corrupted size: expected 16, got 12>" %s
+
+# ELF-ERR: Displaying notes found in: .note.gnu.property
+# ELF-ERR-NEXT: Owner Data size Description
+# ELF-ERR-NEXT: GNU 0x000000[[DATASIZE]] NT_GNU_PROPERTY_TYPE_0 (property note)
+# ELF-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]]
+
+# OBJ-ERR: Notes [
+# OBJ-ERR-NEXT: NoteSection {
+# OBJ-ERR-NEXT: Name: .note.gnu.property
+# OBJ-ERR-NEXT: Offset: 0x40
+# OBJ-ERR-NEXT: Size: 0x[[SIZE]]
+# OBJ-ERR-NEXT: Note {
+# OBJ-ERR-NEXT: Owner: GNU
+# OBJ-ERR-NEXT: Data size: 0x[[DATASIZE]]
+# OBJ-ERR-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+# OBJ-ERR-NEXT: Property [
+# OBJ-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]]
+# OBJ-ERR-NEXT: ]
+# OBJ-ERR-NEXT: }
+# OBJ-ERR-NEXT: }
+# OBJ-ERR-NEXT: ]
+
+#--- gnu-long.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 24 // Data size
+ .quad 42 // PAuth ABI platform
+ .quad 1 // PAuth ABI version
+ .quad 0x0123456789ABCDEF
+ .p2align 3 // Align to 8 bytes for 64-bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-long.s -o gnu-long.o
+# RUN: llvm-readelf --notes gnu-long.o | \
+# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=30 -DDATASIZE=20 \
+# RUN: -DERR="<corrupted size: expected 16, got 24>" %s
+# RUN: llvm-readobj --notes gnu-long.o | \
+# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=30 -DDATASIZE=20 \
+# RUN: -DERR="<corrupted size: expected 16, got 24>" %s
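
Every split-file case above emits the same 16-byte GNU_PROPERTY_AARCH64_FEATURE_PAUTH payload inside an NT_GNU_PROPERTY_TYPE_0 note; only the two values differ. Read back, the payload is just two target-endian 64-bit words (the struct name here is illustrative; the dumper reads the words directly):

    #include <cstdint>

    // Illustrative layout of the PAuth core info property payload.
    struct AArch64PAuthCoreInfo {
      uint64_t Platform; // 0 = invalid, 1 = baremetal, 0x10000002 = llvm_linux
      uint64_t Version;  // for llvm_linux: bitmask of PointerAuth* features
    };
    static_assert(sizeof(AArch64PAuthCoreInfo) == 16, "matches the data size");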
diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
index 377e6f9..b517f0b 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
+++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
@@ -1,3 +1,5 @@
+// See tests for GNU_PROPERTY_AARCH64_FEATURE_PAUTH in aarch64-feature-pauth.s
+
// RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu %s -o %t
// RUN: llvm-readelf --notes %t | FileCheck %s --check-prefix=GNU
// RUN: llvm-readobj --notes %t | FileCheck %s --check-prefix=LLVM
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
index 677dfc4..7246ba4 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -857,7 +857,9 @@ bool DwarfLinkerForBinary::linkImpl(
return error(toString(std::move(E)));
}
- if (Map.getTriple().isOSDarwin() && !Map.getBinaryPath().empty() &&
+ auto MapTriple = Map.getTriple();
+ if ((MapTriple.isOSDarwin() || MapTriple.isOSBinFormatMachO()) &&
+ !Map.getBinaryPath().empty() &&
ObjectType == Linker::OutputFileType::Object)
return MachOUtils::generateDsymCompanion(
Options.VFS, Map, *Streamer->getAsmPrinter().OutStreamer, OutFile,
diff --git a/llvm/tools/gold/CMakeLists.txt b/llvm/tools/gold/CMakeLists.txt
index 58b3238..5c78529 100644
--- a/llvm/tools/gold/CMakeLists.txt
+++ b/llvm/tools/gold/CMakeLists.txt
@@ -12,7 +12,7 @@ if( LLVM_ENABLE_PIC AND LLVM_BINUTILS_INCDIR )
TargetParser
)
- add_llvm_library(LLVMgold MODULE
+ add_llvm_library(LLVMgold MODULE INSTALL_WITH_TOOLCHAIN
gold-plugin.cpp
)
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 49154dc..6ad1c99 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -258,12 +258,8 @@ int main(int argc, char **argv) {
// All that llvm-dis does is write the assembly to a file.
if (!DontPrint) {
if (M) {
- bool ChangeDbgFormat = M->IsNewDbgInfoFormat != WriteNewDbgInfoFormat;
- if (ChangeDbgFormat)
- M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat);
M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
- if (ChangeDbgFormat)
- M->setIsNewDbgInfoFormat(!WriteNewDbgInfoFormat);
}
if (Index)
Index->print(Out->os());
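
The llvm-dis change swaps the manual flip-and-restore of the debug info format for ScopedDbgInfoFormatSetter, an RAII helper, so the original format is restored on every exit path. A minimal sketch of the idea, assuming only the IsNewDbgInfoFormat/setIsNewDbgInfoFormat members visible in the removed lines (the real class ships with LLVM and may differ in detail):

    // Illustrative RAII setter, not the real ScopedDbgInfoFormatSetter.
    template <typename T> class ScopedFormatSetter {
      T &Obj;
      bool SavedFormat;

    public:
      ScopedFormatSetter(T &Obj, bool NewFormat)
          : Obj(Obj), SavedFormat(Obj.IsNewDbgInfoFormat) {
        Obj.setIsNewDbgInfoFormat(NewFormat);
      }
      ~ScopedFormatSetter() { Obj.setIsNewDbgInfoFormat(SavedFormat); }
    };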
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 7269c51..70e8546 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -736,6 +736,42 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
return createStringError(errc::invalid_argument, Reason);
}
+ for (const auto *A : InputArgs.filtered(OBJCOPY_compress_sections)) {
+ SmallVector<StringRef, 0> Fields;
+ StringRef(A->getValue()).split(Fields, '=');
+ if (Fields.size() != 2 || Fields[1].empty()) {
+ return createStringError(
+ errc::invalid_argument,
+ A->getSpelling() +
+ ": parse error, not 'section-glob=[none|zlib|zstd]'");
+ }
+
+ auto Type = StringSwitch<DebugCompressionType>(Fields[1])
+ .Case("zlib", DebugCompressionType::Zlib)
+ .Case("zstd", DebugCompressionType::Zstd)
+ .Default(DebugCompressionType::None);
+ if (Type == DebugCompressionType::None && Fields[1] != "none") {
+ return createStringError(
+ errc::invalid_argument,
+ "invalid or unsupported --compress-sections format: %s",
+ A->getValue());
+ }
+
+ auto &P = Config.compressSections.emplace_back();
+ P.second = Type;
+ auto Matcher =
+ NameOrPattern::create(Fields[0], SectionMatchStyle, ErrorCallback);
+ // =none allows overriding a previous =zlib or =zstd. Reject negative
+ // patterns, which would be confusing.
+ if (Matcher && !Matcher->isPositiveMatch()) {
+ return createStringError(
+ errc::invalid_argument,
+ "--compress-sections: negative pattern is unsupported");
+ }
+ if (Error E = P.first.addMatcher(std::move(Matcher)))
+ return std::move(E);
+ }
+
Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
// The gnu_debuglink's target is expected to not change or else its CRC would
// become invalidated and get rejected. We can avoid recalculating the
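
Each --compress-sections argument appends one (glob, format) pair in command-line order, which is what gives the tests their last-match-wins behavior ('*0=none' overriding '*0=zlib'). A self-contained sketch of that resolution, with globMatches standing in for the real NameOrPattern matcher:

    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    enum class Format { None, Zlib, Zstd };

    // Minimal stand-in for ObjCopy's glob matcher: supports a leading '*'
    // (suffix match) or an exact name, which covers the patterns used above.
    bool globMatches(const std::string &Pattern, const std::string &Name) {
      if (!Pattern.empty() && Pattern.front() == '*') {
        std::string Suffix = Pattern.substr(1);
        return Name.size() >= Suffix.size() &&
               Name.compare(Name.size() - Suffix.size(), Suffix.size(),
                            Suffix) == 0;
      }
      return Pattern == Name;
    }

    std::optional<Format>
    resolveFormat(const std::string &Sec,
                  const std::vector<std::pair<std::string, Format>> &Rules) {
      std::optional<Format> Result;
      for (const auto &[Glob, Fmt] : Rules)
        if (globMatches(Glob, Sec))
          Result = Fmt; // a later --compress-sections entry overrides earlier ones
      return Result;
    }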
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index be02616..4bc80eb 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -35,6 +35,12 @@ def : Flag<["--"], "compress-debug-sections">, Alias<compress_debug_sections>,
AliasArgs<["zlib"]>;
def decompress_debug_sections : Flag<["--"], "decompress-debug-sections">,
HelpText<"Decompress DWARF debug sections">;
+defm compress_sections
+ : Eq<"compress-sections",
+ "Compress or decompress sections using specified format. Supported "
+ "formats: zlib, zstd. Specify 'none' for decompression">,
+ MetaVarName<"<section-glob>=<format>">;
+
defm split_dwo
: Eq<"split-dwo", "Equivalent to --extract-dwo and <dwo-file> as the output file and no other options, "
"and then --strip-dwo on the input file">,
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 4b406ef..d353482 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -61,6 +61,7 @@
#include "llvm/Support/SystemZ/zOSSupport.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <array>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
@@ -5105,6 +5106,73 @@ template <class ELFT> void GNUELFDumper<ELFT>::printAddrsig() {
}
}
+template <class ELFT>
+static bool printAArch64PAuthABICoreInfo(raw_ostream &OS, uint32_t DataSize,
+ ArrayRef<uint8_t> Desc) {
+ OS << " AArch64 PAuth ABI core info: ";
+ // DataSize is the size without padding; Desc.size() is the size with padding.
+ if (DataSize != 16) {
+ OS << format("<corrupted size: expected 16, got %d>", DataSize);
+ return false;
+ }
+
+ uint64_t Platform =
+ support::endian::read64<ELFT::Endianness>(Desc.data() + 0);
+ uint64_t Version = support::endian::read64<ELFT::Endianness>(Desc.data() + 8);
+
+ const char *PlatformDesc = [Platform]() {
+ switch (Platform) {
+ case AARCH64_PAUTH_PLATFORM_INVALID:
+ return "invalid";
+ case AARCH64_PAUTH_PLATFORM_BAREMETAL:
+ return "baremetal";
+ case AARCH64_PAUTH_PLATFORM_LLVM_LINUX:
+ return "llvm_linux";
+ default:
+ return "unknown";
+ }
+ }();
+
+ std::string VersionDesc = [Platform, Version]() -> std::string {
+ if (Platform != AARCH64_PAUTH_PLATFORM_LLVM_LINUX)
+ return "";
+ if (Version >= (1 << (AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST + 1)))
+ return "unknown";
+
+ std::array<StringRef, AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST + 1>
+ Flags;
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS] = "Intrinsics";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS] = "Calls";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS] = "Returns";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS] = "AuthTraps";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR] =
+ "VTPtrAddressDiscrimination";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR] =
+ "VTPtrTypeDiscrimination";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI] = "InitFini";
+
+ static_assert(AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI ==
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST,
+ "Update when new enum items are defined");
+
+ std::string Desc;
+ for (uint32_t I = 0, End = Flags.size(); I < End; ++I) {
+ if (!(Version & (1ULL << I)))
+ Desc += '!';
+ Desc +=
+ Twine("PointerAuth" + Flags[I] + (I == End - 1 ? "" : ", ")).str();
+ }
+ return Desc;
+ }();
+
+ OS << format("platform 0x%" PRIx64 " (%s), version 0x%" PRIx64, Platform,
+ PlatformDesc, Version);
+ if (!VersionDesc.empty())
+ OS << format(" (%s)", VersionDesc.c_str());
+
+ return true;
+}
+
template <typename ELFT>
static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
ArrayRef<uint8_t> Data) {
@@ -5162,6 +5230,9 @@ static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
if (PrData)
OS << format("<unknown flags: 0x%x>", PrData);
return OS.str();
+ case GNU_PROPERTY_AARCH64_FEATURE_PAUTH:
+ printAArch64PAuthABICoreInfo<ELFT>(OS, DataSize, Data);
+ return OS.str();
case GNU_PROPERTY_X86_FEATURE_2_NEEDED:
case GNU_PROPERTY_X86_FEATURE_2_USED:
OS << "x86 feature "
@@ -5364,29 +5435,6 @@ static bool printAndroidNote(raw_ostream &OS, uint32_t NoteType,
}
template <class ELFT>
-static bool printAArch64Note(raw_ostream &OS, uint32_t NoteType,
- ArrayRef<uint8_t> Desc) {
- if (NoteType != NT_ARM_TYPE_PAUTH_ABI_TAG)
- return false;
-
- OS << " AArch64 PAuth ABI tag: ";
- if (Desc.size() < 16) {
- OS << format("<corrupted size: expected at least 16, got %d>", Desc.size());
- return false;
- }
-
- uint64_t Platform = endian::read64<ELFT::Endianness>(Desc.data() + 0);
- uint64_t Version = endian::read64<ELFT::Endianness>(Desc.data() + 8);
- OS << format("platform 0x%" PRIx64 ", version 0x%" PRIx64, Platform, Version);
-
- if (Desc.size() > 16)
- OS << ", additional info 0x"
- << toHex(ArrayRef<uint8_t>(Desc.data() + 16, Desc.size() - 16));
-
- return true;
-}
-
-template <class ELFT>
void GNUELFDumper<ELFT>::printMemtag(
const ArrayRef<std::pair<std::string, std::string>> DynamicEntries,
const ArrayRef<uint8_t> AndroidNoteDesc,
@@ -5783,10 +5831,6 @@ const NoteType AndroidNoteTypes[] = {
"NT_ANDROID_TYPE_MEMTAG (Android memory tagging information)"},
};
-const NoteType ARMNoteTypes[] = {
- {ELF::NT_ARM_TYPE_PAUTH_ABI_TAG, "NT_ARM_TYPE_PAUTH_ABI_TAG"},
-};
-
const NoteType CoreNoteTypes[] = {
{ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"},
{ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"},
@@ -5905,8 +5949,6 @@ StringRef getNoteTypeName(const typename ELFT::Note &Note, unsigned ELFType) {
return FindNote(LLVMOMPOFFLOADNoteTypes);
if (Name == "Android")
return FindNote(AndroidNoteTypes);
- if (Name == "ARM")
- return FindNote(ARMNoteTypes);
if (ELFType == ELF::ET_CORE)
return FindNote(CoreNoteTypes);
@@ -6062,9 +6104,6 @@ template <class ELFT> void GNUELFDumper<ELFT>::printNotes() {
} else if (Name == "Android") {
if (printAndroidNote(OS, Type, Descriptor))
return Error::success();
- } else if (Name == "ARM") {
- if (printAArch64Note<ELFT>(OS, Type, Descriptor))
- return Error::success();
}
if (!Descriptor.empty()) {
OS << " description data:";
@@ -7703,27 +7742,6 @@ static bool printAndroidNoteLLVMStyle(uint32_t NoteType, ArrayRef<uint8_t> Desc,
}
template <class ELFT>
-static bool printAarch64NoteLLVMStyle(uint32_t NoteType, ArrayRef<uint8_t> Desc,
- ScopedPrinter &W) {
- if (NoteType != NT_ARM_TYPE_PAUTH_ABI_TAG)
- return false;
-
- if (Desc.size() < 16)
- return false;
-
- uint64_t platform = endian::read64<ELFT::Endianness>(Desc.data() + 0);
- uint64_t version = endian::read64<ELFT::Endianness>(Desc.data() + 8);
- W.printNumber("Platform", platform);
- W.printNumber("Version", version);
-
- if (Desc.size() > 16)
- W.printString("Additional info",
- toHex(ArrayRef<uint8_t>(Desc.data() + 16, Desc.size() - 16)));
-
- return true;
-}
-
-template <class ELFT>
void LLVMELFDumper<ELFT>::printMemtag(
const ArrayRef<std::pair<std::string, std::string>> DynamicEntries,
const ArrayRef<uint8_t> AndroidNoteDesc,
@@ -7859,9 +7877,6 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printNotes() {
} else if (Name == "Android") {
if (printAndroidNoteLLVMStyle(Type, Descriptor, W))
return Error::success();
- } else if (Name == "ARM") {
- if (printAarch64NoteLLVMStyle<ELFT>(Type, Descriptor, W))
- return Error::success();
}
if (!Descriptor.empty()) {
W.printBinaryBlock("Description data", Descriptor);
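
For the llvm_linux platform the version word printed above is a feature bitmask, with '!' marking clear bits. A standalone decode of the 0x55 case from the pauth test, using the same flag order as the new printer:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const char *Flags[] = {"Intrinsics", "Calls", "Returns", "AuthTraps",
                             "VTPtrAddressDiscrimination",
                             "VTPtrTypeDiscrimination", "InitFini"};
      uint64_t Version = 0x55; // 0b1010101: bits 0, 2, 4, 6 set
      for (int I = 0; I < 7; ++I)
        std::printf("%sPointerAuth%s%s", (Version >> I) & 1 ? "" : "!",
                    Flags[I], I == 6 ? "\n" : ", ");
      // Prints: PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns,
      // !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination,
      // !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini
    }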
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index d5ef63e..76fc264 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -3249,21 +3249,11 @@ TEST(APIntTest, SolveQuadraticEquationWrap) {
}
TEST(APIntTest, MultiplicativeInverseExaustive) {
- for (unsigned BitWidth = 1; BitWidth <= 16; ++BitWidth) {
- for (unsigned Value = 0; Value < (1u << BitWidth); ++Value) {
+ for (unsigned BitWidth = 1; BitWidth <= 8; ++BitWidth) {
+ for (unsigned Value = 1; Value < (1u << BitWidth); Value += 2) {
+ // Multiplicative inverse exists for all odd numbers.
APInt V = APInt(BitWidth, Value);
- APInt MulInv =
- V.zext(BitWidth + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(BitWidth + 1))
- .trunc(BitWidth);
- APInt One = V * MulInv;
- if (!V.isZero() && V.countr_zero() == 0) {
- // Multiplicative inverse exists for all odd numbers.
- EXPECT_TRUE(One.isOne());
- } else {
- // Multiplicative inverse does not exist for even numbers (and 0).
- EXPECT_TRUE(MulInv.isZero());
- }
+ EXPECT_EQ(V * V.multiplicativeInverse(), 1);
}
}
}
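
The rewritten test leans on the fact that a multiplicative inverse modulo 2^n exists exactly for the odd values (even values share a factor of 2 with the modulus). A standalone illustration via Newton's iteration, working modulo 2^64 through unsigned wraparound; this sketches the math, not APInt::multiplicativeInverse itself:

    #include <cassert>
    #include <cstdint>

    // For odd v, x = v is already an inverse modulo 2^3 (v*v == 1 mod 8), and
    // each Newton step x <- x*(2 - v*x) doubles the number of correct bits.
    uint64_t inverseMod2e64(uint64_t v) {
      assert(v & 1 && "even values have no inverse modulo a power of two");
      uint64_t x = v;
      for (int i = 0; i < 5; ++i) // 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits
        x *= 2 - v * x;
      return x;
    }

    int main() {
      for (uint64_t v = 1; v < 512; v += 2)
        assert(v * inverseMod2e64(v) == 1);
    }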
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 1cca44e..f1aa6f3 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -265,7 +265,9 @@ TEST(MemProf, PortableWrapper) {
EXPECT_EQ(3UL, ReadBlock.getAllocCpuId());
}
-TEST(MemProf, RecordSerializationRoundTrip) {
+// Version0 and Version1 serialize IndexedMemProfRecord in the same format, so
+// we share one test.
+TEST(MemProf, RecordSerializationRoundTripVersion0And1) {
const MemProfSchema Schema = getFullSchema();
MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000,
@@ -284,14 +286,47 @@ TEST(MemProf, RecordSerializationRoundTrip) {
Info);
}
Record.CallSites.assign(CallSites);
+ for (const auto &CS : CallSites)
+ Record.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS));
std::string Buffer;
llvm::raw_string_ostream OS(Buffer);
- Record.serialize(Schema, OS);
+ Record.serialize(Schema, OS, llvm::memprof::Version0);
OS.flush();
const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize(
- Schema, reinterpret_cast<const unsigned char *>(Buffer.data()));
+ Schema, reinterpret_cast<const unsigned char *>(Buffer.data()),
+ llvm::memprof::Version0);
+
+ EXPECT_EQ(Record, GotRecord);
+}
+
+TEST(MemProf, RecordSerializationRoundTripVersion2) {
+ const MemProfSchema Schema = getFullSchema();
+
+ MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000,
+ /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3,
+ /*dealloc_cpu=*/4);
+
+ llvm::SmallVector<llvm::memprof::CallStackId> CallStackIds = {0x123, 0x456};
+
+ llvm::SmallVector<llvm::memprof::CallStackId> CallSiteIds = {0x333, 0x444};
+
+ IndexedMemProfRecord Record;
+ for (const auto &CSId : CallStackIds) {
+ // Use the same info block for both allocation sites.
+ Record.AllocSites.emplace_back(llvm::SmallVector<FrameId>(), CSId, Info);
+ }
+ Record.CallSiteIds.assign(CallSiteIds);
+
+ std::string Buffer;
+ llvm::raw_string_ostream OS(Buffer);
+ Record.serialize(Schema, OS, llvm::memprof::Version2);
+ OS.flush();
+
+ const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize(
+ Schema, reinterpret_cast<const unsigned char *>(Buffer.data()),
+ llvm::memprof::Version2);
EXPECT_EQ(Record, GotRecord);
}
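The new Version2 round-trip test exercises the format change the hunks above prepare for: rather than serializing each call stack's frames inline with the record, Version2 stores a 64-bit CallStackId hash (the literal 0x123/0x456 and 0x333/0x444 values here) that refers to a call-stack table kept elsewhere in the profile. A rough sketch of the two layouts, with hypothetical writeV0/writeV2 helpers standing in for IndexedMemProfRecord::serialize:

#include <cstdint>
#include <vector>

using FrameId = uint64_t;
using CallStackId = uint64_t;

// Version0/1 shape: the record carries its frames and writes them inline,
// so a stack repeated across records is serialized repeatedly.
void writeV0(std::vector<uint64_t> &Out, const std::vector<FrameId> &Stack) {
  Out.push_back(Stack.size()); // frame count, then the frames themselves
  Out.insert(Out.end(), Stack.begin(), Stack.end());
}

// Version2 shape: the record writes only the hash; the frames live once in
// a shared table, so each additional reference costs a single word.
void writeV2(std::vector<uint64_t> &Out, CallStackId CSId) {
  Out.push_back(CSId);
}

This is also why the Version0/1 test above now populates CallSiteIds via hashCallStack(CS): records keep the IDs alongside the frames, so the same in-memory structure can serialize to either format.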
diff --git a/llvm/unittests/TextAPI/TextStubV5Tests.cpp b/llvm/unittests/TextAPI/TextStubV5Tests.cpp
index c77d13e..62fdd79 100644
--- a/llvm/unittests/TextAPI/TextStubV5Tests.cpp
+++ b/llvm/unittests/TextAPI/TextStubV5Tests.cpp
@@ -722,7 +722,7 @@ TEST(TBDv5, WriteFile) {
File.setInstallName("@rpath/S/L/F/Foo.framework/Foo");
File.setCurrentVersion(PackedVersion(1, 2, 0));
File.setCompatibilityVersion(PackedVersion(1, 1, 0));
- File.addRPath(AllTargets[0], "@executable_path/.../Frameworks");
+ File.addRPath("@executable_path/.../Frameworks", AllTargets[0]);
for (const auto &Targ : AllTargets) {
File.addParentUmbrella(Targ, "System");
@@ -897,7 +897,7 @@ TEST(TBDv5, WriteMultipleDocuments) {
NestedFile.setTwoLevelNamespace();
NestedFile.setApplicationExtensionSafe(false);
NestedFile.setCurrentVersion(PackedVersion(2, 1, 1));
- NestedFile.addRPath(AllTargets[0], "@executable_path/.../Frameworks");
+ NestedFile.addRPath("@executable_path/.../Frameworks", AllTargets[0]);
for (const auto &Targ : AllTargets)
NestedFile.addReexportedLibrary("@rpath/libfoo.dylib", Targ);
NestedFile.addSymbol(EncodeKind::GlobalSymbol, "_funcFoo", AllTargets,
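Both call sites above track a parameter-order change in InterfaceFile::addRPath, which, judging from the updated calls, now takes the rpath string first and the target second. A sketch of a call against that assumed signature:

#include "llvm/TextAPI/InterfaceFile.h"

using namespace llvm::MachO;

// Assumes the post-change order addRPath(StringRef, const Target &),
// inferred from the call sites in this diff.
void addFrameworkRPath(InterfaceFile &File, const Target &Targ) {
  File.addRPath("@executable_path/Frameworks", Targ);
}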
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 076d042..7a5d2be 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -3858,8 +3858,10 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
for (unsigned i = NumResults, e = CGI.Operands.size(); i != e; ++i) {
CGIOperandList::OperandInfo &Op = CGI.Operands[i];
const std::string &OpName = Op.Name;
- if (OpName.empty())
+ if (OpName.empty()) {
I.error("Operand #" + Twine(i) + " in operands list has no name!");
+ continue;
+ }
if (!InstInputs.count(OpName)) {
// If this is an operand with a DefaultOps set filled in, we can ignore
@@ -3872,16 +3874,19 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
}
I.error("Operand $" + OpName +
" does not appear in the instruction pattern");
+ continue;
}
TreePatternNodePtr InVal = InstInputs[OpName];
InstInputs.erase(OpName); // It occurred, remove from map.
if (InVal->isLeaf() && isa<DefInit>(InVal->getLeafValue())) {
Record *InRec = cast<DefInit>(InVal->getLeafValue())->getDef();
- if (!checkOperandClass(Op, InRec))
+ if (!checkOperandClass(Op, InRec)) {
I.error("Operand $" + OpName +
"'s register class disagrees"
" between the operand and pattern");
+ continue;
+ }
}
Operands.push_back(Op.Rec);
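The added continue statements follow from how TreePattern diagnostics behave here: error() records the problem rather than necessarily aborting, so without an explicit continue the loop would fall through and operate on an operand it has just rejected. A generic sketch of that report-and-skip shape, with a hypothetical recordError standing in for I.error:

#include <string>
#include <vector>

struct Operand {
  std::string Name;
};

std::vector<std::string> Errors;

// Non-fatal: remember the message and return to the caller.
void recordError(const std::string &Msg) { Errors.push_back(Msg); }

void validateOperands(const std::vector<Operand> &Ops) {
  for (const Operand &Op : Ops) {
    if (Op.Name.empty()) {
      recordError("operand has no name");
      continue; // skip: the checks below assume a named operand
    }
    // ... further checks and uses of Op.Name ...
  }
}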
diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
index 77cf65b..665a394 100644
--- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
+++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
@@ -197,12 +197,12 @@ ENTRY(VPUNPCKLQDQZ128rm, VPUNPCKLQDQrm)
ENTRY(VPUNPCKLQDQZ128rr, VPUNPCKLQDQrr)
ENTRY(VPXORQZ128rm, VPXORrm)
ENTRY(VPXORQZ128rr, VPXORrr)
-ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDm)
-ENTRY(VRNDSCALEPDZ128rri, VROUNDPDr)
-ENTRY(VRNDSCALESDZm, VROUNDSDm)
-ENTRY(VRNDSCALESDZm_Int, VROUNDSDm_Int)
-ENTRY(VRNDSCALESDZr, VROUNDSDr)
-ENTRY(VRNDSCALESDZr_Int, VROUNDSDr_Int)
+ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDmi)
+ENTRY(VRNDSCALEPDZ128rri, VROUNDPDri)
+ENTRY(VRNDSCALESDZm, VROUNDSDmi)
+ENTRY(VRNDSCALESDZm_Int, VROUNDSDmi_Int)
+ENTRY(VRNDSCALESDZr, VROUNDSDri)
+ENTRY(VRNDSCALESDZr_Int, VROUNDSDri_Int)
ENTRY(VSHUFPDZ128rmi, VSHUFPDrmi)
ENTRY(VSHUFPDZ128rri, VSHUFPDrri)
ENTRY(VSQRTPDZ128m, VSQRTPDm)
@@ -306,8 +306,8 @@ ENTRY(VPUNPCKLQDQZ256rm, VPUNPCKLQDQYrm)
ENTRY(VPUNPCKLQDQZ256rr, VPUNPCKLQDQYrr)
ENTRY(VPXORQZ256rm, VPXORYrm)
ENTRY(VPXORQZ256rr, VPXORYrr)
-ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYm)
-ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYr)
+ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYmi)
+ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYri)
ENTRY(VSHUFPDZ256rmi, VSHUFPDYrmi)
ENTRY(VSHUFPDZ256rri, VSHUFPDYrri)
ENTRY(VSQRTPDZ256m, VSQRTPDYm)
diff --git a/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn
index 5fead24..dc85fb0 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn
@@ -12,7 +12,6 @@ static_library("Profile") {
"DataAggregator.cpp",
"DataReader.cpp",
"Heatmap.cpp",
- "ProfileReaderBase.cpp",
"StaleProfileMatching.cpp",
"YAMLProfileReader.cpp",
"YAMLProfileWriter.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
index 33fdecf..59dc38c 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
@@ -25,6 +25,7 @@ static_library("readability") {
"DeleteNullPointerCheck.cpp",
"DuplicateIncludeCheck.cpp",
"ElseAfterReturnCheck.cpp",
+ "EnumInitialValueCheck.cpp",
"FunctionCognitiveComplexityCheck.cpp",
"FunctionSizeCheck.cpp",
"IdentifierLengthCheck.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 8a2ab18..78a6e6f 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -352,6 +352,7 @@ if (current_toolchain == default_toolchain) {
"__chrono/formatter.h",
"__chrono/hh_mm_ss.h",
"__chrono/high_resolution_clock.h",
+ "__chrono/leap_second.h",
"__chrono/literals.h",
"__chrono/month.h",
"__chrono/month_weekday.h",
@@ -778,12 +779,12 @@ if (current_toolchain == default_toolchain) {
"__tree",
"__tuple/find_index.h",
"__tuple/make_tuple_types.h",
- "__tuple/pair_like.h",
"__tuple/sfinae_helpers.h",
"__tuple/tuple_element.h",
"__tuple/tuple_indices.h",
"__tuple/tuple_like.h",
"__tuple/tuple_like_ext.h",
+ "__tuple/tuple_like_no_subrange.h",
"__tuple/tuple_size.h",
"__tuple/tuple_types.h",
"__type_traits/add_const.h",
diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
index 5530972..90f6f5d 100644
--- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
@@ -315,6 +315,7 @@ if (libcxx_enable_experimental) {
sources = [ "experimental/keep.cpp" ]
if (libcxx_enable_filesystem && libcxx_enable_time_zone_database) {
sources += [
+ "include/tzdb/leap_second_private.h",
"include/tzdb/time_zone_link_private.h",
"include/tzdb/time_zone_private.h",
"include/tzdb/types_private.h",
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
index 15766d4..fba8118 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
@@ -17,6 +17,7 @@ static_library("GlobalISel") {
"CallLowering.cpp",
"Combiner.cpp",
"CombinerHelper.cpp",
+ "CombinerHelperVectorOps.cpp",
"GIMatchTableExecutor.cpp",
"GISelChangeObserver.cpp",
"GISelKnownBits.cpp",