aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bolt/unittests/Profile/PerfSpeEvents.cpp88
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp3
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt1
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp (renamed from clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp)10
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h (renamed from clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h)16
-rw-r--r--clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp5
-rw-r--r--clang-tools-extra/clang-tidy/cert/CMakeLists.txt1
-rw-r--r--clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp9
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst5
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst13
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst5
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/list.rst3
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c (renamed from clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c)10
-rw-r--r--clang/docs/LibASTMatchersReference.html14
-rw-r--r--clang/include/clang/ASTMatchers/ASTMatchers.h15
-rw-r--r--clang/include/clang/Basic/LangOptions.def2
-rw-r--r--clang/lib/ASTMatchers/ASTMatchersInternal.cpp2
-rw-r--r--clang/lib/ASTMatchers/Dynamic/Registry.cpp1
-rw-r--r--clang/test/ClangScanDeps/strip-codegen-args.m6
-rw-r--r--clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp14
-rw-r--r--clang/unittests/Support/TimeProfilerTest.cpp3
-rw-r--r--flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp8
-rw-r--r--flang/test/Lower/CUDA/cuda-device-proc.cuf4
-rw-r--r--libc/src/__support/CPP/type_traits/is_destructible.h1
-rw-r--r--libc/startup/baremetal/arm/start.cpp26
-rw-r--r--libcxx/utils/ci/buildkite-pipeline.yml2
-rw-r--r--llvm/docs/SPIRVUsage.rst2
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h4
-rw-r--r--llvm/include/llvm/CodeGen/TargetLowering.h5
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h19
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp62
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp13
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp151
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h11
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp33
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp131
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h7
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp38
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td12
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td140
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp8
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp149
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.h6
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp9
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp81
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp11
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td5
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp7
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp64
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h85
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp10
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp13
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp4
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h6
-rw-r--r--llvm/test/CodeGen/AArch64/vldn_shuffle.ll105
-rw-r--r--llvm/test/CodeGen/ARM/ldexp-fp128.ll66
-rw-r--r--llvm/test/CodeGen/LoongArch/ldptr.ll10
-rw-r--r--llvm/test/CodeGen/LoongArch/sink-fold-addi.ll88
-rw-r--r--llvm/test/CodeGen/LoongArch/stptr.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rv64xtheadba.ll19
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zba.ll209
-rw-r--r--llvm/test/CodeGen/RISCV/zicond-opts.ll32
-rw-r--r--llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll36
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll98
-rw-r--r--llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll342
-rw-r--r--llvm/test/CodeGen/WebAssembly/memory-interleave.ll1106
-rw-r--r--llvm/test/CodeGen/X86/pr166744.ll66
-rw-r--r--llvm/test/MC/AMDGPU/gfx90a_err.s43
-rw-r--r--llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s52
-rw-r--r--llvm/test/MC/AMDGPU/gfx942_err.s28
-rw-r--r--llvm/test/MC/AMDGPU/gfx9_asm_flat.s858
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt864
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll117
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll187
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll16
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll17
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected45
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll13
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected16
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test9
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test8
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanTest.cpp10
-rw-r--r--llvm/utils/UpdateTestChecks/common.py1
-rw-r--r--llvm/utils/UpdateTestChecks/mir.py11
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn1
-rw-r--r--llvm/utils/profcheck-xfail.txt103
-rwxr-xr-xllvm/utils/update_llc_test_checks.py94
-rw-r--r--mlir/docs/PassManagement.md6
-rw-r--r--mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td10
-rw-r--r--mlir/include/mlir/Support/Timing.h5
-rw-r--r--mlir/lib/Interfaces/ControlFlowInterfaces.cpp1
-rw-r--r--mlir/lib/Support/Timing.cpp18
-rw-r--r--mlir/tools/mlir-tblgen/PassGen.cpp25
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel25
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel2
107 files changed, 2706 insertions, 3478 deletions
diff --git a/bolt/unittests/Profile/PerfSpeEvents.cpp b/bolt/unittests/Profile/PerfSpeEvents.cpp
index 8d023cd..4f060cd 100644
--- a/bolt/unittests/Profile/PerfSpeEvents.cpp
+++ b/bolt/unittests/Profile/PerfSpeEvents.cpp
@@ -161,4 +161,92 @@ TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
parseAndCheckBrstackEvents(1234, ExpectedSamples);
}
+TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstackAndPbt) {
+ // Check perf input with SPE branch events as brstack format by
+ // combining with the previous branch target address (named as PBT).
+ // Example collection command:
+ // ```
+ // perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
+ // ```
+ // How Bolt extracts the branch events:
+ // ```
+ // perf script -F pid,brstack --itrace=bl
+ // ```
+
+ opts::ArmSPE = true;
+ opts::ReadPerfEvents =
+ // "<PID> <SRC>/<DEST>/PN/-/-/10/COND/- <NULL>/<PBT>/-/-/-/0//-\n"
+ " 4567 0xa002/0xa003/PN/-/-/10/COND/- 0x0/0xa001/-/-/-/0//-\n"
+ " 4567 0xb002/0xb003/P/-/-/4/RET/- 0x0/0xb001/-/-/-/0//-\n"
+ " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n"
+ " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n"
+ " 4567 0xe005/0xe009/P/-/-/14/RET/- 0x0/0xe001/-/-/-/0//-\n"
+ " 4567 0xd456/0xd789/M/-/-/7/RET/- 0x0/0xd123/-/-/-/0//-\n"
+ " 4567 0xf002/0xf003/MN/-/-/8/COND/- 0x0/0xf001/-/-/-/0//-\n"
+ " 4567 0xc456/0xc789/P/-/-/13/-/- 0x0/0xc123/-/-/-/0//-\n";
+
+ // ExpectedSamples contains the aggregated information about
+ // a branch {{From, To, TraceTo}, {TakenCount, MispredCount}}.
+ // Where
+ // - From: is the source address of the sampled branch operation.
+ // - To: is the target address of the sampled branch operation.
+ // - TraceTo could be either
+ // - A 'Type = Trace::BR_ONLY', which means the trace only contains branch
+ // data.
+ // - Or an address, when the trace contains information about the previous
+ // branch.
+ //
+ // When FEAT_SPE_PBT is present, Arm SPE emits two records per sample:
+ // - the current branch (Spe.From/Spe.To), and
+ // - the previous taken branch target (PBT) (PBT.From, PBT.To).
+ //
+ // Together they behave like a depth-1 branch stack where:
+ // - the PBT entry is always taken
+ // - the current branch entry may represent a taken branch or a fall-through
+ // - the destination (Spe.To) is the architecturally executed target
+ //
+ // There can be fall-throughs to be inferred between the PBT entry and
+ // the current branch (Spe.From), but there cannot be between current
+ // branch's (Spe.From/Spe.To).
+ //
+ // PBT records only the target address (PBT.To), meaning we have no
+ // information as the branch source (PBT.From=0x0), branch type, and the
+ // prediction bit.
+ //
+ // Consider the trace pair:
+ // {{Spe.From, Spe.To, Type}, {TK, MP}},
+ // {{PBT.From, PBT.To, TraceTo}, {TK, MP}}
+ // {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}}, {{0x0, 0xd123, 0xd456}, {2, 0}}
+ //
+ // The first entry is the Spe record, which represents a trace from 0xd456
+ // (Spe.From) to 0xd789 (Spe.To). Type = Trace::BR_ONLY, as Bolt processes the
+ // current branch event first. At this point we have no information about the
+ // previous trace (PBT). This entry has a TakenCount = 2, as we have two
+ // samples for (0xd456, 0xd789) in our input. It also has MispredsCount = 2,
+ // as 'M' misprediction flag appears in both cases.
+ //
+ // The second entry is the PBT record. TakenCount = 2 because the
+ // (PBT.From = 0x0, PBT.To = 0xd123) branch target appears twice in the input,
+ // and MispredsCount = 0 because prediction data is absent. There is no branch
+ // source information, so the PBT.From field is zero (0x0). TraceTo = 0xd456
+ // connect the flow from the previous taken branch at 0xd123 (PBT.To) to the
+ // current source branch at 0xd456 (Spe.From), which then continues to 0xd789
+ // (Spe.To).
+ std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = {
+ {{0xa002, 0xa003, Trace::BR_ONLY}, {1, 0}},
+ {{0x0, 0xa001, 0xa002}, {1, 0}},
+ {{0xb002, 0xb003, Trace::BR_ONLY}, {1, 0}},
+ {{0x0, 0xb001, 0xb002}, {1, 0}},
+ {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 0}},
+ {{0x0, 0xc123, 0xc456}, {2, 0}},
+ {{0xd456, 0xd789, Trace::BR_ONLY}, {2, 2}},
+ {{0x0, 0xd123, 0xd456}, {2, 0}},
+ {{0xe005, 0xe009, Trace::BR_ONLY}, {1, 0}},
+ {{0x0, 0xe001, 0xe005}, {1, 0}},
+ {{0xf002, 0xf003, Trace::BR_ONLY}, {1, 1}},
+ {{0x0, 0xf001, 0xf002}, {1, 0}}};
+
+ parseAndCheckBrstackEvents(4567, ExpectedSamples);
+}
+
#endif
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 611717a..e1856ff 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -30,6 +30,7 @@
#include "EasilySwappableParametersCheck.h"
#include "EmptyCatchCheck.h"
#include "ExceptionEscapeCheck.h"
+#include "FloatLoopCounterCheck.h"
#include "FoldInitTypeCheck.h"
#include "ForwardDeclarationNamespaceCheck.h"
#include "ForwardingReferenceOverloadCheck.h"
@@ -153,6 +154,8 @@ public:
CheckFactories.registerCheck<EmptyCatchCheck>("bugprone-empty-catch");
CheckFactories.registerCheck<ExceptionEscapeCheck>(
"bugprone-exception-escape");
+ CheckFactories.registerCheck<FloatLoopCounterCheck>(
+ "bugprone-float-loop-counter");
CheckFactories.registerCheck<FoldInitTypeCheck>("bugprone-fold-init-type");
CheckFactories.registerCheck<ForwardDeclarationNamespaceCheck>(
"bugprone-forward-declaration-namespace");
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 6c96996..7d2e108 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -26,6 +26,7 @@ add_clang_library(clangTidyBugproneModule STATIC
EasilySwappableParametersCheck.cpp
EmptyCatchCheck.cpp
ExceptionEscapeCheck.cpp
+ FloatLoopCounterCheck.cpp
FoldInitTypeCheck.cpp
ForwardDeclarationNamespaceCheck.cpp
ForwardingReferenceOverloadCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp
index 01299e0..adf2d2b 100644
--- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.cpp
@@ -6,16 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include "FloatLoopCounter.h"
+#include "FloatLoopCounterCheck.h"
#include "clang/AST/ASTContext.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"
using namespace clang::ast_matchers;
-namespace clang::tidy::cert {
+namespace clang::tidy::bugprone {
-void FloatLoopCounter::registerMatchers(MatchFinder *Finder) {
+void FloatLoopCounterCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(
forStmt(hasIncrement(forEachDescendant(
declRefExpr(hasType(realFloatingPointType()),
@@ -29,7 +29,7 @@ void FloatLoopCounter::registerMatchers(MatchFinder *Finder) {
this);
}
-void FloatLoopCounter::check(const MatchFinder::MatchResult &Result) {
+void FloatLoopCounterCheck::check(const MatchFinder::MatchResult &Result) {
const auto *FS = Result.Nodes.getNodeAs<ForStmt>("for");
diag(FS->getInc()->getBeginLoc(), "loop induction expression should not have "
@@ -43,4 +43,4 @@ void FloatLoopCounter::check(const MatchFinder::MatchResult &Result) {
DiagnosticIDs::Note);
}
-} // namespace clang::tidy::cert
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h
index d00c036..43dd9c2 100644
--- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h
+++ b/clang-tools-extra/clang-tidy/bugprone/FloatLoopCounterCheck.h
@@ -6,27 +6,27 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H
#include "../ClangTidyCheck.h"
-namespace clang::tidy::cert {
+namespace clang::tidy::bugprone {
/// This check diagnoses when the loop induction expression of a for loop has
/// floating-point type. The check corresponds to:
/// https://www.securecoding.cert.org/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters
///
/// For the user-facing documentation see:
-/// https://clang.llvm.org/extra/clang-tidy/checks/cert/flp30-c.html
-class FloatLoopCounter : public ClangTidyCheck {
+/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/float-loop-counter.html
+class FloatLoopCounterCheck : public ClangTidyCheck {
public:
- FloatLoopCounter(StringRef Name, ClangTidyContext *Context)
+ FloatLoopCounterCheck(StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
};
-} // namespace clang::tidy::cert
+} // namespace clang::tidy::bugprone
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_FLOAT_LOOP_COUNTER_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_FLOATLOOPCOUNTERCHECK_H
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index 15592d5..a1f6270 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -12,6 +12,7 @@
#include "../bugprone/BadSignalToKillThreadCheck.h"
#include "../bugprone/CommandProcessorCheck.h"
#include "../bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h"
+#include "../bugprone/FloatLoopCounterCheck.h"
#include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h"
#include "../bugprone/RawMemoryCallOnNonTrivialTypeCheck.h"
#include "../bugprone/ReservedIdentifierCheck.h"
@@ -37,7 +38,6 @@
#include "../performance/MoveConstructorInitCheck.h"
#include "../readability/EnumInitialValueCheck.h"
#include "../readability/UppercaseLiteralSuffixCheck.h"
-#include "FloatLoopCounter.h"
#include "LimitedRandomnessCheck.h"
#include "MutatingCopyCheck.h"
#include "ProperlySeededRandomGeneratorCheck.h"
@@ -310,7 +310,8 @@ public:
CheckFactories.registerCheck<bugprone::SuspiciousMemoryComparisonCheck>(
"cert-exp42-c");
// FLP
- CheckFactories.registerCheck<FloatLoopCounter>("cert-flp30-c");
+ CheckFactories.registerCheck<bugprone::FloatLoopCounterCheck>(
+ "cert-flp30-c");
CheckFactories.registerCheck<bugprone::SuspiciousMemoryComparisonCheck>(
"cert-flp37-c");
// FIO
diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt
index 27cf392..b25576a3 100644
--- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt
@@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS
add_clang_library(clangTidyCERTModule STATIC
CERTTidyModule.cpp
- FloatLoopCounter.cpp
LimitedRandomnessCheck.cpp
MutatingCopyCheck.cpp
ProperlySeededRandomGeneratorCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp
index a89a896..e7d97b2 100644
--- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp
+++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.cpp
@@ -13,17 +13,10 @@ using namespace clang::ast_matchers;
namespace clang::tidy::hicpp {
-namespace {
-AST_MATCHER(VarDecl, isAsm) { return Node.hasAttr<clang::AsmLabelAttr>(); }
-const ast_matchers::internal::VariadicDynCastAllOfMatcher<Decl,
- FileScopeAsmDecl>
- fileScopeAsmDecl; // NOLINT(readability-identifier-*) preserve clang style
-} // namespace
-
void NoAssemblerCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(asmStmt().bind("asm-stmt"), this);
Finder->addMatcher(fileScopeAsmDecl().bind("asm-file-scope"), this);
- Finder->addMatcher(varDecl(isAsm()).bind("asm-var"), this);
+ Finder->addMatcher(varDecl(hasAttr(attr::AsmLabel)).bind("asm-var"), this);
}
void NoAssemblerCheck::check(const MatchFinder::MatchResult &Result) {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 88eb0eb..8ba1d1a 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -269,6 +269,11 @@ New check aliases
<clang-tidy/checks/bugprone/throwing-static-initialization>`
keeping initial check as an alias to the new one.
+- Renamed :doc:`cert-flp30-c <clang-tidy/checks/cert/flp30-c>` to
+ :doc:`bugprone-float-loop-counter
+ <clang-tidy/checks/bugprone/float-loop-counter>`
+ keeping initial check as an alias to the new one.
+
- Renamed :doc:`cert-mem57-cpp <clang-tidy/checks/cert/mem57-cpp>` to
:doc:`bugprone-default-operator-new-on-overaligned-type
<clang-tidy/checks/bugprone/default-operator-new-on-overaligned-type>`
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst
new file mode 100644
index 0000000..e663b40
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/float-loop-counter.rst
@@ -0,0 +1,13 @@
+.. title:: clang-tidy - bugprone-float-loop-counter
+
+bugprone-float-loop-counter
+===========================
+
+Flags ``for`` loops where the induction expression has a floating-point type.
+
+References
+----------
+
+This check corresponds to the CERT C Coding Standard rule
+`FLP30-C. Do not use floating-point variables as loop counters
+<https://www.securecoding.cert.org/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst
index c37b639..5f6eff44 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cert/flp30-c.rst
@@ -3,8 +3,9 @@
cert-flp30-c
============
-This check flags ``for`` loops where the induction expression has a
-floating-point type.
+The `cert-flp30-c` check is an alias, please see
+`bugprone-float-loop-counter <../bugprone/float-loop-counter.html>`_
+for more information
This check corresponds to the CERT C Coding Standard rule
`FLP30-C. Do not use floating-point variables as loop counters
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 5094bcc..be509ab 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -98,6 +98,7 @@ Clang-Tidy Checks
:doc:`bugprone-easily-swappable-parameters <bugprone/easily-swappable-parameters>`,
:doc:`bugprone-empty-catch <bugprone/empty-catch>`,
:doc:`bugprone-exception-escape <bugprone/exception-escape>`,
+ :doc:`bugprone-float-loop-counter <bugprone/float-loop-counter>`,
:doc:`bugprone-fold-init-type <bugprone/fold-init-type>`,
:doc:`bugprone-forward-declaration-namespace <bugprone/forward-declaration-namespace>`,
:doc:`bugprone-forwarding-reference-overload <bugprone/forwarding-reference-overload>`,
@@ -178,7 +179,6 @@ Clang-Tidy Checks
:doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes"
:doc:`cert-err33-c <cert/err33-c>`,
:doc:`cert-err60-cpp <cert/err60-cpp>`,
- :doc:`cert-flp30-c <cert/flp30-c>`,
:doc:`cert-msc50-cpp <cert/msc50-cpp>`,
:doc:`cert-msc51-cpp <cert/msc51-cpp>`,
:doc:`cert-oop58-cpp <cert/oop58-cpp>`,
@@ -451,6 +451,7 @@ Check aliases
:doc:`cert-err61-cpp <cert/err61-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`,
:doc:`cert-exp42-c <cert/exp42-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`,
:doc:`cert-fio38-c <cert/fio38-c>`, :doc:`misc-non-copyable-objects <misc/non-copyable-objects>`,
+ :doc:`cert-flp30-c <cert/flp30-c>`, :doc:`bugprone-float-loop-counter <bugprone/float-loop-counter>`,
:doc:`cert-flp37-c <cert/flp37-c>`, :doc:`bugprone-suspicious-memory-comparison <bugprone/suspicious-memory-comparison>`,
:doc:`cert-int09-c <cert/int09-c>`, :doc:`readability-enum-initial-value <readability/enum-initial-value>`, "Yes"
:doc:`cert-mem57-cpp <cert/mem57-cpp>`, :doc:`bugprone-default-operator-new-on-overaligned-type <bugprone/default-operator-new-on-overaligned-type>`,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c
index b998588..77812f0 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cert/flp30-c.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/float-loop-counter.c
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cert-flp30-c %t
+// RUN: %check_clang_tidy %s bugprone-float-loop-counter %t
float g(void);
int c(float);
@@ -7,16 +7,16 @@ float f = 1.0f;
void match(void) {
for (float x = 0.1f; x <= 1.0f; x += 0.1f) {}
- // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: loop induction expression should not have floating-point type [cert-flp30-c]
+ // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter]
for (; f > 0; --f) {}
- // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: loop induction expression should not have floating-point type [cert-flp30-c]
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter]
for (float x = 0.0f; c(x); x = g()) {}
- // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: loop induction expression should not have floating-point type [cert-flp30-c]
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter]
for (int i=0; i < 10 && f < 2.0f; f++, i++) {}
- // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: loop induction expression should not have floating-point type [cert-flp30-c]
+ // CHECK-MESSAGES: :[[@LINE-1]]:37: warning: loop induction expression should not have floating-point type [bugprone-float-loop-counter]
// CHECK-MESSAGES: :5:1: note: floating-point type loop induction variable
}
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index 5b2a96d..ac1abb4 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -825,6 +825,20 @@ fieldDecl()
</pre></td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('fileScopeAsmDecl0')"><a name="fileScopeAsmDecl0Anchor">fileScopeAsmDecl</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FileScopeAsmDecl.html">FileScopeAsmDecl</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="fileScopeAsmDecl0"><pre>Matches top level asm declarations.
+
+Given
+ __asm("nop");
+ void f() {
+ __asm("mov al, 2");
+ }
+fileScopeAsmDecl()
+ matches '__asm("nop")',
+ but not '__asm("mov al, 2")'.
+</pre></td></tr>
+
+
<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Decl.html">Decl</a>&gt;</td><td class="name" onclick="toggle('friendDecl0')"><a name="friendDecl0Anchor">friendDecl</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FriendDecl.html">FriendDecl</a>&gt;...</td></tr>
<tr><td colspan="4" class="doc" id="friendDecl0"><pre>Matches friend declarations.
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 98e62de..bca2d84 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -2478,6 +2478,21 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt;
/// matches '__asm("mov al, 2")'
extern const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt;
+/// Matches top level asm declarations.
+///
+/// Given
+/// \code
+/// __asm("nop");
+/// void f() {
+/// __asm("mov al, 2");
+/// }
+/// \endcode
+/// fileScopeAsmDecl()
+/// matches '__asm("nop")',
+/// but not '__asm("mov al, 2")'.
+extern const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl>
+ fileScopeAsmDecl;
+
/// Matches bool literals.
///
/// Example matches true
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index d3cca82..40fc66e 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -454,7 +454,7 @@ LANGOPT(BranchTargetEnforcement, 1, 0, NotCompatible, "Branch-target enforcement
LANGOPT(BranchProtectionPAuthLR, 1, 0, NotCompatible, "Use PC as a diversifier using PAuthLR NOP instructions.")
LANGOPT(GuardedControlStack, 1, 0, NotCompatible, "Guarded control stack enabled")
-LANGOPT(SpeculativeLoadHardening, 1, 0, NotCompatible, "Speculative load hardening enabled")
+LANGOPT(SpeculativeLoadHardening, 1, 0, Benign, "Speculative load hardening enabled")
LANGOPT(RelativeCXXABIVTables, 1, 0, NotCompatible,
"Use an ABI-incompatible v-table layout that uses relative references")
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index 42f124b..0874b3d 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -954,6 +954,8 @@ const internal::VariadicDynCastAllOfMatcher<Stmt, CXXTryStmt> cxxTryStmt;
const internal::VariadicDynCastAllOfMatcher<Stmt, CXXThrowExpr> cxxThrowExpr;
const internal::VariadicDynCastAllOfMatcher<Stmt, NullStmt> nullStmt;
const internal::VariadicDynCastAllOfMatcher<Stmt, AsmStmt> asmStmt;
+const internal::VariadicDynCastAllOfMatcher<Decl, FileScopeAsmDecl>
+ fileScopeAsmDecl;
const internal::VariadicDynCastAllOfMatcher<Stmt, CXXBoolLiteralExpr>
cxxBoolLiteral;
const internal::VariadicDynCastAllOfMatcher<Stmt, StringLiteral> stringLiteral;
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 01c03f3..66848f7 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -246,6 +246,7 @@ RegistryMaps::RegistryMaps() {
REGISTER_MATCHER(expr);
REGISTER_MATCHER(exprWithCleanups);
REGISTER_MATCHER(fieldDecl);
+ REGISTER_MATCHER(fileScopeAsmDecl);
REGISTER_MATCHER(fixedPointLiteral);
REGISTER_MATCHER(floatLiteral);
REGISTER_MATCHER(forCallable);
diff --git a/clang/test/ClangScanDeps/strip-codegen-args.m b/clang/test/ClangScanDeps/strip-codegen-args.m
index 71171f4..f2cec62 100644
--- a/clang/test/ClangScanDeps/strip-codegen-args.m
+++ b/clang/test/ClangScanDeps/strip-codegen-args.m
@@ -16,6 +16,7 @@
// CHECK-NOT: "-flto"
// CHECK-NOT: "-fno-autolink"
// CHECK-NOT: "-mrelax-relocations=no"
+// CHECK-NOT: "-mspeculative-load-hardening"
// CHECK: ]
// CHECK: "name": "A"
// CHECK: }
@@ -39,6 +40,11 @@
"command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -flto=full -fsyntax-only DIR/t3.m",
"file": "DIR/t2.m"
}
+ {
+ "directory": "DIR",
+ "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -mspeculative-load-hardening -fsyntax-only DIR/t3.m",
+ "file": "DIR/t3.m"
+ }
]
//--- modules/A/module.modulemap
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
index 9692d6e..3fcb558 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -1179,6 +1179,12 @@ TEST_P(ASTMatchersTest, PredefinedExpr) {
has(stringLiteral()))));
}
+TEST_P(ASTMatchersTest, FileScopeAsmDecl) {
+ EXPECT_TRUE(matches("__asm(\"nop\");", fileScopeAsmDecl()));
+ EXPECT_TRUE(
+ notMatches("void f() { __asm(\"mov al, 2\"); }", fileScopeAsmDecl()));
+}
+
TEST_P(ASTMatchersTest, AsmStatement) {
EXPECT_TRUE(matches("void foo() { __asm(\"mov al, 2\"); }", asmStmt()));
}
@@ -2442,7 +2448,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) {
"int main() {"
" int a;"
" f(a);"
- "}", matcher));
+ "}",
+ matcher));
EXPECT_FALSE(matches("template <class ...T> void f(T &...args) {"
" [...args = args] () mutable {"
" }();"
@@ -2450,7 +2457,8 @@ TEST_P(ASTMatchersTest, LambdaCaptureTest_BindsToCaptureOfReferenceType) {
"int main() {"
" int a;"
" f(a);"
- "}", matcher));
+ "}",
+ matcher));
}
TEST_P(ASTMatchersTest, IsDerivedFromRecursion) {
@@ -2628,7 +2636,7 @@ TEST(ASTMatchersTestObjC, ObjCStringLiteral) {
" [Test someFunction:@\"Ola!\"]; "
"}\n"
"@end ";
- EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral()));
+ EXPECT_TRUE(matchesObjC(Objc1String, objcStringLiteral()));
}
TEST(ASTMatchersTestObjC, ObjCDecls) {
diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
index e544c89..3b18aa83 100644
--- a/clang/unittests/Support/TimeProfilerTest.cpp
+++ b/clang/unittests/Support/TimeProfilerTest.cpp
@@ -186,7 +186,8 @@ std::string buildTraceGraph(StringRef Json) {
} // namespace
-TEST(TimeProfilerTest, ConstantEvaluationCxx20) {
+// FIXME: Flaky test. See https://github.com/llvm/llvm-project/pull/138613
+TEST(TimeProfilerTest, DISABLED_ConstantEvaluationCxx20) {
std::string Code = R"(
void print(double value);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 6312e61..4c0d266 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1122,13 +1122,7 @@ CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
void CUDAIntrinsicLibrary::genSyncWarp(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync";
- mlir::Value mask = fir::getBase(args[0]);
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> argsList{mask};
- fir::CallOp::create(builder, loc, funcOp, argsList);
+ mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0]));
}
// THIS_GRID
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 2d2c801..9f8f74a 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -105,7 +105,7 @@ end
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: nvvm.barrier0
-! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
+! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
@@ -219,7 +219,7 @@ end
! CHECK-LABEL: func.func @_QPhost1()
! CHECK: cuf.kernel
! CHECK: nvvm.barrier0
-! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
+! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32
diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h b/libc/src/__support/CPP/type_traits/is_destructible.h
index 7ada223..dc5e62b 100644
--- a/libc/src/__support/CPP/type_traits/is_destructible.h
+++ b/libc/src/__support/CPP/type_traits/is_destructible.h
@@ -15,6 +15,7 @@
#include "src/__support/CPP/type_traits/remove_all_extents.h"
#include "src/__support/CPP/type_traits/true_type.h"
#include "src/__support/CPP/type_traits/type_identity.h"
+#include "src/__support/CPP/utility/declval.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
diff --git a/libc/startup/baremetal/arm/start.cpp b/libc/startup/baremetal/arm/start.cpp
index c089a14..4740067 100644
--- a/libc/startup/baremetal/arm/start.cpp
+++ b/libc/startup/baremetal/arm/start.cpp
@@ -131,6 +131,32 @@ namespace LIBC_NAMESPACE_DECL {
__arm_wsr("CPSR_c", 0x13); // SVC
#endif
+#ifdef __ARM_FP
+// Enable FPU
+#if __ARM_ARCH_PROFILE == 'M'
+ // Based on
+ // https://developer.arm.com/documentation/dui0646/c/Cortex-M7-Peripherals/Floating-Point-Unit/Enabling-the-FPU
+ // Set CPACR cp10 and cp11
+ auto cpacr = (volatile uint32_t *const)0xE000ED88;
+ *cpacr |= (0xF << 20);
+ __dsb(0xF);
+ __isb(0xF);
+#elif __ARM_ARCH_PROFILE == 'A' || __ARM_ARCH_PROFILE == 'R'
+ // Based on
+ // https://developer.arm.com/documentation/dui0472/m/Compiler-Coding-Practices/Enabling-NEON-and-FPU-for-bare-metal
+ // Set CPACR cp10 and cp11
+ uint32_t cpacr = __arm_rsr("p15:0:c1:c0:2");
+ cpacr |= (0xF << 20);
+ __arm_wsr("p15:0:c1:c0:2", cpacr);
+ __isb(0xF);
+ // Set FPEXC.EN
+ uint32_t fpexc;
+ __asm__ __volatile__("vmrs %0, FPEXC" : "=r"(fpexc) : :);
+ fpexc |= (1 << 30);
+ __asm__ __volatile__("vmsr FPEXC, %0" : : "r"(fpexc) :);
+#endif
+#endif
+
// Perform the equivalent of scatterloading
LIBC_NAMESPACE::memcpy(__data_start, __data_source,
reinterpret_cast<uintptr_t>(__data_size));
diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml
index ca83af9..2ac69c3 100644
--- a/libcxx/utils/ci/buildkite-pipeline.yml
+++ b/libcxx/utils/ci/buildkite-pipeline.yml
@@ -103,7 +103,6 @@ steps:
queue: libcxx-builders
os: aix
<<: *common
- skip: "https://github.com/llvm/llvm-project/issues/162516"
- label: AIX (64-bit)
command: libcxx/utils/ci/run-buildbot aix
@@ -115,7 +114,6 @@ steps:
queue: libcxx-builders
os: aix
<<: *common
- skip: "https://github.com/llvm/llvm-project/issues/162516"
- group: ':freebsd: FreeBSD'
steps:
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 7499613..9ecd390 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -241,6 +241,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e
- Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate.
* - ``SPV_KHR_maximal_reconvergence``
- Adds execution mode and capability to enable maximal reconvergence.
+ * - ``SPV_ALTERA_blocking_pipes``
+ - Adds new pipe read and write functions that have blocking semantics instead of the non-blocking semantics of the existing pipe read/write functions.
SPIR-V representation in LLVM IR
================================
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 268025e7..9d6038d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -297,6 +297,10 @@ private:
/// \pre \p U is a call instruction.
bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
+ bool translateIntrinsic(
+ const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+ const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+
/// When an invoke or a cleanupret unwinds to the next EH pad, there are
/// many places it could ultimately go. In the IR, we have a single unwind
/// destination, but in the machine CFG, we enumerate all the possible blocks.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8aeaa9c..2550c2b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3233,11 +3233,6 @@ public:
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
- /// Return true if the target interleave with shuffles are cheaper
- virtual bool isProfitableToInterleaveWithGatherScatter() const {
- return false;
- }
-
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
index 7cc78d4..fc41641 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
@@ -211,6 +211,21 @@ public:
return FilteredView(Libraries.begin(), Libraries.end(), S, K);
}
+ using LibraryFilterFn = std::function<bool(const LibraryInfo &)>;
+ void getLibraries(LibState S, PathType K,
+ std::vector<std::shared_ptr<LibraryInfo>> &Outs,
+ LibraryFilterFn Filter = nullptr) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &[_, Entry] : Libraries) {
+ const auto &Info = *Entry;
+ if (Info.getKind() != K || Info.getState() != S)
+ continue;
+ if (Filter && !Filter(Info))
+ continue;
+ Outs.push_back(Entry);
+ }
+ }
+
void forEachLibrary(const LibraryVisitor &visitor) const {
std::unique_lock<std::shared_mutex> Lock(Mtx);
for (const auto &[_, entry] : Libraries) {
@@ -220,14 +235,14 @@ public:
}
bool isLoaded(StringRef Path) const {
- std::unique_lock<std::shared_mutex> Lock(Mtx);
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
if (auto It = Libraries.find(Path.str()); It != Libraries.end())
return It->second->getState() == LibState::Loaded;
return false;
}
bool isQueried(StringRef Path) const {
- std::unique_lock<std::shared_mutex> Lock(Mtx);
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
if (auto It = Libraries.find(Path.str()); It != Libraries.end())
return It->second->getState() == LibState::Queried;
return false;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 4fd2204..be1b51f 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2821,20 +2821,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (translateKnownIntrinsic(CI, ID, MIRBuilder))
return true;
+ TargetLowering::IntrinsicInfo Info;
+ bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID);
+
+ return translateIntrinsic(CI, ID, MIRBuilder,
+ IsTgtMemIntrinsic ? &Info : nullptr);
+}
+
+/// Translate a call to an intrinsic.
+/// Depending on whether TLI->getTgtMemIntrinsic() is true, TgtMemIntrinsicInfo
+/// is a pointer to the correspondingly populated IntrinsicInfo object.
+/// Otherwise, this pointer is null.
+bool IRTranslator::translateIntrinsic(
+ const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+ const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
ArrayRef<Register> ResultRegs;
- if (!CI.getType()->isVoidTy())
- ResultRegs = getOrCreateVRegs(CI);
+ if (!CB.getType()->isVoidTy())
+ ResultRegs = getOrCreateVRegs(CB);
// Ignore the callsite attributes. Backend code is most likely not expecting
// an intrinsic to sometimes have side effects and sometimes not.
MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs);
- if (isa<FPMathOperator>(CI))
- MIB->copyIRFlags(CI);
+ if (isa<FPMathOperator>(CB))
+ MIB->copyIRFlags(CB);
- for (const auto &Arg : enumerate(CI.args())) {
+ for (const auto &Arg : enumerate(CB.args())) {
// If this is required to be an immediate, don't materialize it in a
// register.
- if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
+ if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) {
// imm arguments are more convenient than cimm (and realistically
// probably sufficient), so use them.
@@ -2863,29 +2877,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
}
// Add a MachineMemOperand if it is a target mem intrinsic.
- TargetLowering::IntrinsicInfo Info;
- // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
- if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) {
- Align Alignment = Info.align.value_or(
- DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
- LLT MemTy = Info.memVT.isSimple()
- ? getLLTForMVT(Info.memVT.getSimpleVT())
- : LLT::scalar(Info.memVT.getStoreSizeInBits());
+ if (TgtMemIntrinsicInfo) {
+ const Function *F = CB.getCalledFunction();
+
+ Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign(
+ TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext())));
+ LLT MemTy =
+ TgtMemIntrinsicInfo->memVT.isSimple()
+ ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT())
+ : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits());
// TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
// didn't yield anything useful.
MachinePointerInfo MPI;
- if (Info.ptrVal)
- MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
- else if (Info.fallbackAddressSpace)
- MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+ if (TgtMemIntrinsicInfo->ptrVal) {
+ MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal,
+ TgtMemIntrinsicInfo->offset);
+ } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) {
+ MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace);
+ }
MIB.addMemOperand(MF->getMachineMemOperand(
- MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
- /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder));
+ MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(),
+ /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid,
+ TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder));
}
- if (CI.isConvergent()) {
- if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+ if (CB.isConvergent()) {
+ if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) {
auto *Token = Bundle->Inputs[0].get();
Register TokenReg = getOrCreateVReg(*Token);
MIB.addUse(TokenReg, RegState::Implicit);
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 45eca28..5c27a20 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,8 +239,7 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
- unsigned MaxFactor,
- bool InterleaveWithShuffles) {
+ unsigned MaxFactor) {
unsigned NumElts = SVI->getShuffleMask().size();
if (NumElts < 4)
return false;
@@ -251,13 +250,6 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return true;
}
- if (InterleaveWithShuffles) {
- for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
- Factor = i * MaxFactor;
- if (SVI->isInterleave(Factor))
- return true;
- }
- }
return false;
}
@@ -536,8 +528,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
cast<FixedVectorType>(SVI->getType())->getNumElements();
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
- if (!isReInterleaveMask(SVI, Factor, MaxFactor,
- TLI->isProfitableToInterleaveWithGatherScatter()))
+ if (!isReInterleaveMask(SVI, Factor, MaxFactor))
return false;
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 46c4bb8..816b7ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4046,6 +4046,8 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
m_ConstInt(AndMask)))) {
// Type Legalisation Pattern:
// (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff)
+ if (BitWidthDiff.getZExtValue() >= BitWidth)
+ return SDValue();
unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue();
if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth))
return SDValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index fa0c899..9961c98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
// Update successor info.
addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne());
- for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
- BasicBlock *Dest = I.getIndirectDest(i);
+ for (BasicBlock *Dest : I.getIndirectDests()) {
MachineBasicBlock *Target = FuncInfo.getMBB(Dest);
Target->setIsInlineAsmBrIndirectTarget();
// If we introduce a type of asm goto statement that is permitted to use an
@@ -5313,18 +5312,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
DAG.setRoot(OutChain);
}
-/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
-/// node.
-void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
- unsigned Intrinsic) {
- // Ignore the callsite's attributes. A specific call site may be marked with
- // readnone, but the lowering code will expect the chain based on the
- // definition.
+/// Check if this intrinsic call depends on the chain (1st return value)
+/// and if it only *loads* memory.
+/// Ignore the callsite's attributes. A specific call site may be marked with
+/// readnone, but the lowering code will expect the chain based on the
+/// definition.
+std::pair<bool, bool>
+SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) {
const Function *F = I.getCalledFunction();
bool HasChain = !F->doesNotAccessMemory();
bool OnlyLoad =
HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow();
+ return {HasChain, OnlyLoad};
+}
+
+SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands(
+ const CallBase &I, bool HasChain, bool OnlyLoad,
+ TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
// Build the operand list.
SmallVector<SDValue, 8> Ops;
if (HasChain) { // If this intrinsic has side-effects, chainify it.
@@ -5336,17 +5343,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
}
- // Info is set by getTgtMemIntrinsic
- TargetLowering::IntrinsicInfo Info;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I,
- DAG.getMachineFunction(),
- Intrinsic);
-
// Add the intrinsic ID as an integer operand if it's not a target intrinsic.
- if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
- Info.opc == ISD::INTRINSIC_W_CHAIN)
- Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
+ if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID ||
+ TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN)
+ Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(),
TLI.getPointerTy(DAG.getDataLayout())));
// Add all operands of the call to the operand list.
@@ -5369,13 +5369,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
}
+ if (std::optional<OperandBundleUse> Bundle =
+ I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+ Value *Token = Bundle->Inputs[0].get();
+ SDValue ConvControlToken = getValue(Token);
+ assert(Ops.back().getValueType() != MVT::Glue &&
+ "Did not expect another glue node here.");
+ ConvControlToken =
+ DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken);
+ Ops.push_back(ConvControlToken);
+ }
+
+ return Ops;
+}
+
+SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I,
+ bool HasChain) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
if (HasChain)
ValueVTs.push_back(MVT::Other);
- SDVTList VTs = DAG.getVTList(ValueVTs);
+ return DAG.getVTList(ValueVTs);
+}
+
+/// Get an INTRINSIC node for a target intrinsic which does not touch memory.
+SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode(
+ const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops,
+ const SDVTList &VTs) {
+ if (!HasChain)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
+ if (!IntrinsicVT.isVoidTy())
+ return DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
+ return DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+}
+
+/// Set root, convert return type if necessary and check alignment.
+SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I,
+ bool HasChain,
+ bool OnlyLoad,
+ SDValue Result) {
+ if (HasChain) {
+ SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1);
+ if (OnlyLoad)
+ PendingLoads.push_back(Chain);
+ else
+ DAG.setRoot(Chain);
+ }
+
+ if (I.getType()->isVoidTy())
+ return Result;
+
+ if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment) {
+ // Insert `assertalign` node if there's an alignment.
+ Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+ } else if (!isa<VectorType>(I.getType())) {
+ Result = lowerRangeToAssertZExt(DAG, I, Result);
+ }
+
+ return Result;
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+ unsigned Intrinsic) {
+ auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I);
+
+ // Info is set by getTgtMemIntrinsic
+ TargetLowering::IntrinsicInfo Info;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool IsTgtMemIntrinsic =
+ TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic);
+
+ SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands(
+ I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr);
+ SDVTList VTs = getTargetIntrinsicVTList(I, HasChain);
// Propagate fast-math-flags from IR to node(s).
SDNodeFlags Flags;
@@ -5386,19 +5458,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// Create the node.
SDValue Result;
- if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
- auto *Token = Bundle->Inputs[0].get();
- SDValue ConvControlToken = getValue(Token);
- assert(Ops.back().getValueType() != MVT::Glue &&
- "Did not expected another glue node here.");
- ConvControlToken =
- DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken);
- Ops.push_back(ConvControlToken);
- }
-
// In some cases, custom collection of operands from CallInst I may be needed.
TLI.CollectTargetIntrinsicOperands(I, Ops, DAG);
- if (IsTgtIntrinsic) {
+ if (IsTgtMemIntrinsic) {
// This is target intrinsic that touches memory
//
// TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
@@ -5418,34 +5480,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
Info.ssid, Info.order, Info.failureOrder);
Result =
DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO);
- } else if (!HasChain) {
- Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
- } else if (!I.getType()->isVoidTy()) {
- Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
} else {
- Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+ Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs);
}
- if (HasChain) {
- SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
- if (OnlyLoad)
- PendingLoads.push_back(Chain);
- else
- DAG.setRoot(Chain);
- }
-
- if (!I.getType()->isVoidTy()) {
- if (!isa<VectorType>(I.getType()))
- Result = lowerRangeToAssertZExt(DAG, I, Result);
-
- MaybeAlign Alignment = I.getRetAlign();
-
- // Insert `assertalign` node if there's an alignment.
- if (InsertAssertAlign && Alignment) {
- Result =
- DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
- }
- }
+ Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result);
setValue(&I, Result);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 47e19f7..ed63bee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -727,6 +727,17 @@ private:
MCSymbol *&BeginLabel);
SDValue lowerEndEH(SDValue Chain, const InvokeInst *II,
const BasicBlock *EHPadBB, MCSymbol *BeginLabel);
+
+ std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I);
+ SmallVector<SDValue, 8> getTargetIntrinsicOperands(
+ const CallBase &I, bool HasChain, bool OnlyLoad,
+ TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+ SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain);
+ SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain,
+ ArrayRef<SDValue> Ops,
+ const SDVTList &VTs);
+ SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain,
+ bool OnlyLoad, SDValue Result);
};
/// This struct represents the registers (physical or virtual)
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
index 35da82a..7e1d528 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -184,9 +184,9 @@ class SymbolSearchContext {
public:
SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
- bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+ bool hasSearched(const LibraryInfo *Lib) const { return Searched.count(Lib); }
- void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+ void markSearched(const LibraryInfo *Lib) { Searched.insert(Lib); }
inline bool allResolved() const { return Q.allResolved(); }
@@ -194,7 +194,7 @@ public:
private:
SymbolQuery &Q;
- DenseSet<LibraryInfo *> Searched;
+ DenseSet<const LibraryInfo *> Searched;
};
void LibraryResolver::resolveSymbolsInLibrary(
@@ -226,19 +226,18 @@ void LibraryResolver::resolveSymbolsInLibrary(
return EnumerateResult::Continue;
},
Opts);
+ };
+ if (!Lib.hasFilter()) {
+ LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+ << "\n";);
+ enumerateSymbolsIfNeeded();
if (DiscoveredSymbols.empty()) {
LLVM_DEBUG(dbgs() << " No symbols and remove library : "
<< Lib.getFullPath() << "\n";);
LibMgr.removeLibrary(Lib.getFullPath());
return;
}
- };
-
- if (!Lib.hasFilter()) {
- LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
- << "\n";);
- enumerateSymbolsIfNeeded();
SmallVector<StringRef> SymbolVec;
SymbolVec.reserve(DiscoveredSymbols.size());
for (const auto &KV : DiscoveredSymbols)
@@ -288,11 +287,15 @@ void LibraryResolver::searchSymbolsInLibraries(
SymbolSearchContext Ctx(Q);
while (!Ctx.allResolved()) {
+ std::vector<std::shared_ptr<LibraryInfo>> Libs;
+ LibMgr.getLibraries(S, K, Libs, [&](const LibraryInfo &Lib) {
+ return !Ctx.hasSearched(&Lib);
+ });
- for (auto &Lib : LibMgr.getView(S, K)) {
- if (Ctx.hasSearched(Lib.get()))
- continue;
+ if (Libs.empty() && !scanLibrariesIfNeeded(K, scanBatchSize))
+ break; // no more new libs to scan
+ for (auto &Lib : Libs) {
// can use Async here?
resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
Ctx.markSearched(Lib.get());
@@ -300,12 +303,6 @@ void LibraryResolver::searchSymbolsInLibraries(
if (Ctx.allResolved())
return;
}
-
- if (Ctx.allResolved())
- return;
-
- if (!scanLibrariesIfNeeded(K, scanBatchSize))
- break; // no more new libs to scan
}
};
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
index d93f686..32f6dbe 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -50,7 +50,7 @@ void handleError(Error Err, StringRef context = "") {
}
bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
- Triple HostTriple(sys::getDefaultTargetTriple());
+ Triple HostTriple(sys::getProcessTriple());
Triple ObjTriple = Obj.makeTriple();
LLVM_DEBUG({
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2987468..d08f9b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -96,7 +96,6 @@
#include <cctype>
#include <cstdint>
#include <cstdlib>
-#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -17990,17 +17989,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
unsigned Factor,
const APInt &GapMask) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
-
- if (isProfitableToInterleaveWithGatherScatter() &&
- Factor > getMaxSupportedInterleaveFactor())
- return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
-
- assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
- "Invalid interleave factor");
-
assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");
@@ -18146,126 +18139,6 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}
-/// If the interleaved vector elements are greater than supported MaxFactor,
-/// interleaving the data with additional shuffles can be used to
-/// achieve the same.
-///
-/// Consider the following data with 8 interleaves which are shuffled to store
-/// stN instructions. Data needs to be stored in this order:
-/// [v0, v1, v2, v3, v4, v5, v6, v7]
-///
-/// v0 v4 v2 v6 v1 v5 v3 v7
-/// | | | | | | | |
-/// \ / \ / \ / \ /
-/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
-/// | | | |
-/// \ / \ /
-/// \ / \ /
-/// \ / \ /
-/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
-///
-/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
-/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with
-/// another st4.
-///
-/// For stN = 2, upper half of interleaved data V0, V1 is stored
-/// with one st2 instruction. Second set V2, V3 is stored with another st2.
-/// Total of 4 st2's are required here.
-bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
- StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
- unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
-
- auto *VecTy = cast<FixedVectorType>(SVI->getType());
- assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
-
- unsigned LaneLen = VecTy->getNumElements() / Factor;
- Type *EltTy = VecTy->getElementType();
- auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
-
- const DataLayout &DL = SI->getModule()->getDataLayout();
- bool UseScalable;
-
- // Skip if we do not have NEON and skip illegal vector types. We can
- // "legalize" wide vector types into multiple interleaved accesses as long as
- // the vector types are divisible by 128.
- if (!Subtarget->hasNEON() ||
- !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return false;
-
- if (UseScalable)
- return false;
-
- std::deque<Value *> Shuffles;
- Shuffles.push_back(SVI);
- unsigned ConcatLevel = Factor;
- // Getting all the interleaved operands.
- while (ConcatLevel > 1) {
- unsigned InterleavedOperands = Shuffles.size();
- for (unsigned i = 0; i < InterleavedOperands; i++) {
- ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
- if (!SFL)
- return false;
- Shuffles.pop_front();
-
- Value *Op0 = SFL->getOperand(0);
- Value *Op1 = SFL->getOperand(1);
-
- Shuffles.push_back(dyn_cast<Value>(Op0));
- Shuffles.push_back(dyn_cast<Value>(Op1));
- }
- ConcatLevel >>= 1;
- }
-
- IRBuilder<> Builder(SI);
- auto Mask = createInterleaveMask(LaneLen, 2);
- SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
- for (unsigned i = 0; i < LaneLen; i++) {
- LowerHalfMask[i] = Mask[i];
- UpperHalfMask[i] = Mask[i + LaneLen];
- }
-
- unsigned InterleaveFactor = Factor >> 1;
- while (InterleaveFactor >= MaxSupportedFactor) {
- std::deque<Value *> ShufflesIntermediate;
- ShufflesIntermediate.resize(Factor);
- for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- auto *Shuffle = Builder.CreateShuffleVector(
- Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
- ShufflesIntermediate[i + j] = Shuffle;
- Shuffle = Builder.CreateShuffleVector(
- Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
- ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle;
- }
- }
- Shuffles = ShufflesIntermediate;
- InterleaveFactor >>= 1;
- }
-
- Type *PtrTy = SI->getPointerOperandType();
- auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
-
- Value *BaseAddr = SI->getPointerOperand();
- Function *StNFunc = getStructuredStoreFunction(
- SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
- for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
- SmallVector<Value *, 5> Ops;
- for (unsigned j = 0; j < MaxSupportedFactor; j++)
- Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
-
- if (i > 0) {
- // We will compute the pointer operand of each store from the original
- // base address using GEPs. Cast the base address to a pointer to the
- // scalar element type.
- BaseAddr = Builder.CreateConstGEP1_32(
- SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
- }
- Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
- Builder.CreateCall(StNFunc, Ops);
- }
- return true;
-}
-
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bfd8474..70bfae7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,10 +229,6 @@ public:
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
- bool isProfitableToInterleaveWithGatherScatter() const override {
- return true;
- }
-
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -243,9 +239,6 @@ public:
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;
- bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
- unsigned Factor) const;
-
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8729ed3..197aae6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4922,36 +4922,11 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- unsigned NumLoadStores = 1;
- InstructionCost ShuffleCost = 0;
- bool isInterleaveWithShuffle = false;
- unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
-
- auto *SubVecTy =
- VectorType::get(VecVTy->getElementType(),
- VecVTy->getElementCount().divideCoefficientBy(Factor));
-
- if (TLI->isProfitableToInterleaveWithGatherScatter() &&
- Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
- Factor > MaxSupportedFactor) {
- isInterleaveWithShuffle = true;
- SmallVector<int, 16> Mask;
- // preparing interleave Mask.
- for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
- i++) {
- for (unsigned j = 0; j < 2; j++)
- Mask.push_back(j * Factor + i);
- }
-
- NumLoadStores = Factor / MaxSupportedFactor;
- ShuffleCost =
- (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
- Mask, CostKind, 0, SubVecTy));
- }
-
- if (!UseMaskForGaps &&
- (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
+ if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
+ auto *SubVecTy =
+ VectorType::get(VecVTy->getElementType(),
+ VecVTy->getElementCount().divideCoefficientBy(Factor));
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -4959,10 +4934,7 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
- return (Factor *
- TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
- NumLoadStores) +
- ShuffleCost;
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index e3b0a1b..e62fdb6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -312,7 +312,7 @@ public:
}
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const {
- if (!ST->hasSVE())
+ if (!ST->isSVEorStreamingSVEAvailable())
return false;
// For fixed vectors, avoid scalarization if using SVE for them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4fe194c..54d94b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2366,18 +2366,6 @@ def isGFX8GFX9NotGFX90A :
" Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
-// Pre-90A GFX9s allow the NV bit in FLAT instructions.
-def isNVAllowedInFlat :
- Predicate<"!Subtarget->hasGFX90AInsts() &&"
- " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
- AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>;
-
-// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions.
-def isNVNotAllowedInFlat :
- Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||"
- " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">,
- AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>;
-
def isGFX90AOnly :
Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2808c44..09338c5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1602,11 +1602,6 @@ public:
bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
- bool isFlatInstAndNVAllowed(const MCInst &Inst) const {
- uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
- return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A();
- }
-
AMDGPUTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -5375,7 +5370,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
Error(S, "scale_offset is not supported on this GPU");
}
- if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) {
+ if (CPol & CPol::NV) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7150,13 +7145,6 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
unsigned Enabled = 0, Seen = 0;
for (;;) {
SMLoc S = getLoc();
-
- if (isGFX9() && trySkipId("nv")) {
- Enabled |= CPol::NV;
- Seen |= CPol::NV;
- continue;
- }
-
bool Disabling;
unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6ef2241..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
bits<7> saddr;
bits<10> vdst;
- bits<6> cpol;
+ bits<5> cpol;
// Only valid on gfx9
bits<1> lds = ps.lds; // LDS DMA for global and scratch
@@ -2693,52 +2693,29 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
!subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
}
-class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
- FLAT_Real_vi <op, ps, has_sccb> {
- let AssemblerPredicate = isNVNotAllowedInFlat;
-}
-
-class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
- FLAT_Real_vi <op, ps, has_sccb> {
- let AssemblerPredicate = isNVAllowedInFlat;
- let Subtarget = SIEncodingFamily.GFX9;
- let DecoderNamespace = "GFX9";
- let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit.
-}
-
-multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> {
- def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>;
- def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>;
-}
-
multiclass FLAT_Real_AllAddr_vi<bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
- defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
-}
-
-multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op,
- bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
- def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
}
class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
FLAT_Real <op, ps>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
let AssemblerPredicate = isGFX940Plus;
- let DecoderNamespace = "GFX940";
+ let DecoderNamespace = "GFX9";
let Inst{13} = ps.sve;
let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
}
multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
- let OtherPredicates = [isGFX8GFX9NotGFX940] in {
- defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>;
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
+ let AssemblerPredicate = isGFX8GFX9NotGFX940;
+ let OtherPredicates = [isGFX8GFX9NotGFX940];
+ }
+ def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
+ let DecoderNamespace = "GFX9";
}
-
- defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
-
let AssemblerPredicate = isGFX940Plus in {
def _VE_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
@@ -2751,11 +2728,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
let OtherPredicates = [isGFX8GFX9NotGFX940] in {
- let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in {
- defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+ def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
}
- let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in {
- defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+ def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
+ let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
}
}
@@ -2771,66 +2748,47 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
}
-defm FLAT_LOAD_UBYTE_vi : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>;
-defm FLAT_LOAD_SBYTE_vi : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>;
-defm FLAT_LOAD_USHORT_vi : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>;
-defm FLAT_LOAD_SSHORT_vi : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>;
-defm FLAT_LOAD_DWORD_vi : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>;
-defm FLAT_LOAD_DWORDX2_vi : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>;
-defm FLAT_LOAD_DWORDX4_vi : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>;
-defm FLAT_LOAD_DWORDX3_vi : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>;
-
-defm FLAT_STORE_BYTE_vi : FLAT_Real_mc_vi <0x18, FLAT_STORE_BYTE>;
-defm FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
-defm FLAT_STORE_SHORT_vi : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>;
-defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
-defm FLAT_STORE_DWORD_vi : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>;
-defm FLAT_STORE_DWORDX2_vi : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>;
-defm FLAT_STORE_DWORDX4_vi : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>;
-defm FLAT_STORE_DWORDX3_vi : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>;
-
-defm FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>;
-defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
-defm FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>;
-defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
-defm FLAT_LOAD_SHORT_D16_vi : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>;
-defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_vi : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
+def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
+def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+def FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+def FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
+def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
multiclass FLAT_Real_Atomics_vi <bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
defvar ps = !cast<FLAT_Pseudo>(NAME);
- defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
- defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
-}
-
-multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op,
- bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
- defvar ps = !cast<FLAT_Pseudo>(NAME);
- def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
- def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
-
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+ def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
}
multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
FLAT_Real_AllAddr_vi<op, has_sccb> {
- defm _RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
- defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
-
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
- def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
-}
-
-multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op,
- bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
- FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> {
- def _RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
- def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+ def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+ def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
- def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
- def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+ def _RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+ def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
}
defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>;
@@ -2992,10 +2950,10 @@ let AssemblerPredicate = isGFX940Plus in {
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>;
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>;
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>;
- defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>;
- defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>;
- defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi_ex_gfx9<0x52>;
- defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>;
+ defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d>;
+ defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e>;
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52>;
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
} // End AssemblerPredicate = isGFX940Plus
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 3e6f35d..703ec0a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -186,12 +186,8 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " dlc";
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
- if (Imm & ~CPol::ALL_pregfx12) {
- if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
- O << " nv";
- else
- O << " /* unexpected cache policy bit */";
- }
+ if (Imm & ~CPol::ALL_pregfx12)
+ O << " /* unexpected cache policy bit */";
}
void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index c89212d..90a4723 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -756,6 +756,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
return ArrayRef(TargetFlags);
}
+bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
+ Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const {
+ enum MemIOffsetType {
+ Imm14Shift2,
+ Imm12,
+ Imm11Shift1,
+ Imm10Shift2,
+ Imm9Shift3,
+ Imm8,
+ Imm8Shift1,
+ Imm8Shift2,
+ Imm8Shift3
+ };
+
+ MemIOffsetType OT;
+ switch (MemI.getOpcode()) {
+ default:
+ return false;
+ case LoongArch::LDPTR_W:
+ case LoongArch::LDPTR_D:
+ case LoongArch::STPTR_W:
+ case LoongArch::STPTR_D:
+ OT = Imm14Shift2;
+ break;
+ case LoongArch::LD_B:
+ case LoongArch::LD_H:
+ case LoongArch::LD_W:
+ case LoongArch::LD_D:
+ case LoongArch::LD_BU:
+ case LoongArch::LD_HU:
+ case LoongArch::LD_WU:
+ case LoongArch::ST_B:
+ case LoongArch::ST_H:
+ case LoongArch::ST_W:
+ case LoongArch::ST_D:
+ case LoongArch::FLD_S:
+ case LoongArch::FLD_D:
+ case LoongArch::FST_S:
+ case LoongArch::FST_D:
+ case LoongArch::VLD:
+ case LoongArch::VST:
+ case LoongArch::XVLD:
+ case LoongArch::XVST:
+ case LoongArch::VLDREPL_B:
+ case LoongArch::XVLDREPL_B:
+ OT = Imm12;
+ break;
+ case LoongArch::VLDREPL_H:
+ case LoongArch::XVLDREPL_H:
+ OT = Imm11Shift1;
+ break;
+ case LoongArch::VLDREPL_W:
+ case LoongArch::XVLDREPL_W:
+ OT = Imm10Shift2;
+ break;
+ case LoongArch::VLDREPL_D:
+ case LoongArch::XVLDREPL_D:
+ OT = Imm9Shift3;
+ break;
+ case LoongArch::VSTELM_B:
+ case LoongArch::XVSTELM_B:
+ OT = Imm8;
+ break;
+ case LoongArch::VSTELM_H:
+ case LoongArch::XVSTELM_H:
+ OT = Imm8Shift1;
+ break;
+ case LoongArch::VSTELM_W:
+ case LoongArch::XVSTELM_W:
+ OT = Imm8Shift2;
+ break;
+ case LoongArch::VSTELM_D:
+ case LoongArch::XVSTELM_D:
+ OT = Imm8Shift3;
+ break;
+ }
+
+ if (MemI.getOperand(0).getReg() == Reg)
+ return false;
+
+ if ((AddrI.getOpcode() != LoongArch::ADDI_W &&
+ AddrI.getOpcode() != LoongArch::ADDI_D) ||
+ !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm())
+ return false;
+
+ int64_t OldOffset = MemI.getOperand(2).getImm();
+ int64_t Disp = AddrI.getOperand(2).getImm();
+ int64_t NewOffset = OldOffset + Disp;
+ if (!STI.is64Bit())
+ NewOffset = SignExtend64<32>(NewOffset);
+
+ if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) &&
+ !(OT == Imm12 && isInt<12>(NewOffset)) &&
+ !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) &&
+ !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) &&
+ !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) &&
+ !(OT == Imm8 && isInt<8>(NewOffset)) &&
+ !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) &&
+ !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) &&
+ !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset)))
+ return false;
+
+ AM.BaseReg = AddrI.getOperand(1).getReg();
+ AM.ScaledReg = 0;
+ AM.Scale = 0;
+ AM.Displacement = NewOffset;
+ AM.Form = ExtAddrMode::Formula::Basic;
+ return true;
+}
+
+MachineInstr *
+LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const {
+ const DebugLoc &DL = MemI.getDebugLoc();
+ MachineBasicBlock &MBB = *MemI.getParent();
+
+ assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+ "Addressing mode not supported for folding");
+
+ unsigned MemIOp = MemI.getOpcode();
+ switch (MemIOp) {
+ default:
+ return BuildMI(MBB, MemI, DL, get(MemIOp))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ case LoongArch::VSTELM_B:
+ case LoongArch::VSTELM_H:
+ case LoongArch::VSTELM_W:
+ case LoongArch::VSTELM_D:
+ case LoongArch::XVSTELM_B:
+ case LoongArch::XVSTELM_H:
+ case LoongArch::XVSTELM_W:
+ case LoongArch::XVSTELM_D:
+ return BuildMI(MBB, MemI, DL, get(MemIOp))
+ .addReg(MemI.getOperand(0).getReg(), 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement)
+ .addImm(MemI.getOperand(3).getImm())
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ }
+}
+
// Returns true if this is the sext.w pattern, addi.w rd, rs, 0.
bool LoongArch::isSEXT_W(const MachineInstr &MI) {
return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() &&
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index f25958a..f69a558 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -93,6 +93,12 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+ bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const override;
+ MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const override;
+
protected:
const LoongArchSubtarget &STI;
};
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 9de4c9d..92a9388 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -62,6 +62,11 @@ static cl::opt<bool>
cl::desc("Enable the merge base offset pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableSinkFold("loongarch-enable-sink-fold",
+ cl::desc("Enable sinking and folding of instruction copies"),
+ cl::init(true), cl::Hidden);
+
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
return RM.value_or(Reloc::Static);
}
@@ -146,7 +151,9 @@ namespace {
class LoongArchPassConfig : public TargetPassConfig {
public:
LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ setEnableSinkAndFold(EnableSinkFold);
+ }
LoongArchTargetMachine &getLoongArchTargetMachine() const {
return getTM<LoongArchTargetMachine>();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c3f100e..995ae75 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16496,32 +16496,42 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
}
static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX,
- unsigned ShY) {
+ unsigned ShY, bool AddX) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue X = N->getOperand(0);
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getTargetConstant(ShY, DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getTargetConstant(ShX, DL, VT), Mul359);
+ DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359);
}
static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt) {
+ // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
switch (MulAmt) {
case 5 * 3:
- return getShlAddShlAdd(N, DAG, 2, 1);
+ return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false);
case 9 * 3:
- return getShlAddShlAdd(N, DAG, 3, 1);
+ return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false);
case 5 * 5:
- return getShlAddShlAdd(N, DAG, 2, 2);
+ return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false);
case 9 * 5:
- return getShlAddShlAdd(N, DAG, 3, 2);
+ return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false);
case 9 * 9:
- return getShlAddShlAdd(N, DAG, 3, 3);
+ return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false);
default:
- return SDValue();
+ break;
}
+
+ // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
+ int ShX;
+ if (int ShY = isShifted359(MulAmt - 1, ShX)) {
+ assert(ShX != 0 && "MulAmt=4,6,10 handled before");
+ if (ShX <= 3)
+ return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true);
+ }
+ return SDValue();
}
// Try to expand a scalar multiply to a faster sequence.
@@ -16581,41 +16591,30 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(Shift, DL, VT));
}
- // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
- if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt))
- return V;
+ // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
+ // of 25 which happen to be quite common.
+ // (2/4/8 * 3/5/9 + 1) * 2^N
+ Shift = llvm::countr_zero(MulAmt);
+ if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
+ if (Shift == 0)
+ return V;
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
+ }
// If this is a power 2 + 2/4/8, we can use a shift followed by a single
// shXadd. First check if this a sum of two power of 2s because that's
// easy. Then count how many zeros are up to the first bit.
- if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
- unsigned ScaleShift = llvm::countr_zero(MulAmt);
- if (ScaleShift >= 1 && ScaleShift < 4) {
- unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
- SDLoc DL(N);
- SDValue Shift1 =
- DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getTargetConstant(ScaleShift, DL, VT), Shift1);
- }
+ if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+ unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1)));
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
+ return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+ DAG.getTargetConstant(Shift, DL, VT), Shift1);
}
- // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
- // This is the two instruction form, there are also three instruction
- // variants we could implement. e.g.
- // (2^(1,2,3) * 3,5,9 + 1) << C2
- // 2^(C1>3) * 3,5,9 +/- 1
- if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
- assert(Shift != 0 && "MulAmt=4,6,10 handled before");
- if (Shift <= 3) {
- SDLoc DL(N);
- SDValue Mul359 =
- DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getTargetConstant(ShXAmount, DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getTargetConstant(Shift, DL, VT), X);
- }
- }
+ // TODO: 2^(C1>3) * 3,5,9 +/- 1
// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
@@ -16647,14 +16646,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
}
}
-
- // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
- // of 25 which happen to be quite common.
- Shift = llvm::countr_zero(MulAmt);
- if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
- SDLoc DL(N);
- return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
- }
}
if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt))
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 56a38bb..b2cbdb2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call,
return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR);
}
+static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ unsigned Opcode =
+ SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0));
+}
+
static bool
generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
@@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generatePipeInst(Call.get(), MIRBuilder, GR);
case SPIRV::PredicatedLoadStore:
return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::BlockingPipes:
+ return generateBlockingPipesInst(Call.get(), MIRBuilder, GR);
}
return false;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index c259cce..492a98e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup;
def Block2DLoadStore : BuiltinGroup;
def Pipe : BuiltinGroup;
def PredicatedLoadStore : BuiltinGroup;
+def BlockingPipes : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0
defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+
+//SPV_ALTERA_blocking_pipes
+defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>;
+defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>;
defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 43b2869..f681b0d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -159,7 +159,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
{"SPV_KHR_maximal_reconvergence",
SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence},
{"SPV_INTEL_kernel_attributes",
- SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}};
+ SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes},
+ {"SPV_ALTERA_blocking_pipes",
+ SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index a61351e..03bd61b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr,
"$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">;
def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops),
"OpPredicatedStoreINTEL $ptr $object $predicate">;
+
+// SPV_ALTERA_blocking_pipes
+def OpReadPipeBlockingALTERA : Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+                  "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
+def OpWritePipeBlockingALTERA : Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+                  "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index e5ac76c4..af76016 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1885,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(
SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL);
break;
+ case SPIRV::OpReadPipeBlockingALTERA:
+ case SPIRV::OpWritePipeBlockingALTERA:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes);
+ Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA);
+ }
+ break;
case SPIRV::OpCooperativeMatrixGetElementCoordINTEL:
if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix))
report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the "
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 4e4e6fb..be88f33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -56,6 +56,13 @@ public:
}
};
+static cl::list<std::string> SPVAllowUnknownIntrinsics(
+ "spv-allow-unknown-intrinsics", cl::CommaSeparated,
+ cl::desc("Emit unknown intrinsics as calls to external functions. A "
+ "comma-separated input list of intrinsic prefixes must be "
+ "provided, and only intrinsics carrying a listed prefix get "
+ "emitted as described."),
+ cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional);
} // namespace
char SPIRVPrepareFunctions::ID = 0;
@@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
EraseFromParent);
Changed = true;
break;
+ default:
+ if (TM.getTargetTriple().getVendor() == Triple::AMD ||
+ any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) {
+ if (Prefix.empty())
+ return false;
+ return II->getCalledFunction()->getName().starts_with(Prefix);
+ }))
+ Changed |= lowerIntrinsicToFunction(II);
+ break;
}
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 1b4b29b..65a8885 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>;
defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>;
defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>;
-defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
+defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>;
defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>;
defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>;
@@ -611,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso
defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>;
defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>;
defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>;
+defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>;
//===----------------------------------------------------------------------===//
// Multiclass used to define SourceLanguage enum values and at the same time
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index af32298..fc6c290 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa
// into conversion ops
setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
- ISD::FP_ROUND, ISD::CONCAT_VECTORS});
+ ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND,
+ ISD::CONCAT_VECTORS});
setTargetDAGCombine(ISD::TRUNCATE);
@@ -3580,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N,
}
}
+static SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems,
+ SelectionDAG &DAG) {
+ SDLoc DL(In);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT InVT = In.getValueType();
+ unsigned NumElems = InVT.getVectorNumElements() * 2;
+ EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems);
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT));
+ if (NumElems < RequiredNumElems) {
+ return DoubleVectorWidth(Concat, RequiredNumElems, DAG);
+ }
+ return Concat;
+}
+
+static SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT OutVT = N->getValueType(0);
+ if (!OutVT.isVector())
+ return SDValue();
+
+ EVT OutElTy = OutVT.getVectorElementType();
+ if (OutElTy != MVT::i8 && OutElTy != MVT::i16)
+ return SDValue();
+
+ unsigned NumElems = OutVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ EVT FPVT = N->getOperand(0)->getValueType(0);
+ if (FPVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // First, convert to i32.
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems);
+ SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0));
+ APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(),
+ OutVT.getScalarSizeInBits());
+ // Mask out the top MSBs.
+ SDValue Masked =
+ DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT));
+
+ if (OutVT.getSizeInBits() < 128) {
+ // Create a wide enough vector that we can use narrow.
+ EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
+ unsigned NumRequiredElems = NarrowedVT.getVectorNumElements();
+ SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG);
+ SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG);
+ return DAG.getBitcast(
+ OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits()));
+  }
+
+  // OutVT already fills at least 128 bits; narrow directly.
+  return truncateVectorWithNARROW(OutVT, Masked, DL, DAG);
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3606,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_ROUND:
case ISD::CONCAT_VECTORS:
return performVectorTruncZeroCombine(N, DCI);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performConvertFPCombine(N, DCI.DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5c3f17..906fa2f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7550,13 +7550,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- Load->getAlign(), VPIRMetadata(*Load, LVer),
- I->getDebugLoc());
+ VPIRMetadata(*Load, LVer), I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, Store->getAlign(),
- VPIRMetadata(*Store, LVer), I->getDebugLoc());
+ Reverse, VPIRMetadata(*Store, LVer),
+ I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 22ea083..bbb03fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1163,10 +1163,10 @@ public:
bool opcodeMayReadOrWriteFromMemory() const;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override;
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override;
+ bool usesFirstPartOnly(const VPValue *Op) const override;
/// Returns true if this VPInstruction produces a scalar value from a vector,
/// e.g. by performing a reduction or extracting a lane.
@@ -1393,13 +1393,13 @@ public:
return true;
}
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -1628,7 +1628,7 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- bool onlyFirstLaneUsed(const VPValue *Op) const override;
+ bool usesFirstLaneOnly(const VPValue *Op) const override;
};
/// A recipe for widening Call instructions using library calls.
@@ -1767,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getCond() && isInvariantCond();
@@ -1833,7 +1833,7 @@ public:
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getOperand(0))
@@ -1870,7 +1870,7 @@ public:
void execute(VPTransformState &State) override;
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -1884,7 +1884,7 @@ public:
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -1922,14 +1922,14 @@ public:
Type *getSourceElementType() const { return SourceElementTy; }
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -2110,7 +2110,7 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// The recipe creates its own wide start value, so it only requests the
@@ -2325,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getStartValue();
@@ -2399,7 +2399,7 @@ public:
bool isInLoop() const { return IsInLoop; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return isOrdered() || isInLoop();
@@ -2468,13 +2468,13 @@ public:
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Recursing through Blend recipes only, must terminate at header phi's the
// latest.
return all_of(users(),
- [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
+ [this](VPUser *U) { return U->usesFirstLaneOnly(this); });
}
};
@@ -2562,7 +2562,7 @@ public:
VPCostContext &Ctx) const override;
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+ bool usesFirstLaneOnly(const VPValue *Op) const override = 0;
/// Returns the number of stored operands of this interleave group. Returns 0
/// for load interleave groups.
@@ -2608,7 +2608,7 @@ public:
VPSlotTracker &SlotTracker) const override;
#endif
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
@@ -2656,7 +2656,7 @@ public:
#endif
/// The recipe only uses the first lane of the address, and EVL operand.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
@@ -2862,7 +2862,7 @@ public:
VPValue *getEVL() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return Op == getEVL();
@@ -2924,7 +2924,7 @@ public:
bool isPredicated() const { return IsPredicated; }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return isSingleScalar();
@@ -3206,10 +3206,11 @@ protected:
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse, Align Alignment,
+ bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
+ Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+ Reverse(Reverse) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
!Reverse &&
@@ -3273,18 +3274,18 @@ public:
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse, Align Alignment,
+ bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Alignment, Metadata, DL),
+ Reverse, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, getAlign(),
- *this, getDebugLoc());
+ getMask(), Consecutive, Reverse, *this,
+ getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3299,7 +3300,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive loads operations only demand the first lane of
@@ -3315,8 +3316,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
- L.getAlign(), L, L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+ L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3340,7 +3341,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened loads only demand the first lane of EVL and consecutive loads
@@ -3354,16 +3355,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
+ const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Alignment, Metadata, DL) {
+ Consecutive, Reverse, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, getAlign(), *this, getDebugLoc());
+ Reverse, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3381,7 +3382,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive stores only demand the first lane of their address,
@@ -3398,7 +3399,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
- S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
+ S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
@@ -3424,7 +3425,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
#endif
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL()) {
@@ -3508,14 +3509,14 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}
/// Returns true if the recipe only uses the first part of operand \p Op.
- bool onlyFirstPartUsed(const VPValue *Op) const override {
+ bool usesFirstPartOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3590,7 +3591,7 @@ public:
}
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3700,7 +3701,7 @@ public:
VPValue *getStepValue() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
@@ -3765,7 +3766,7 @@ public:
VPValue *getStepValue() const { return getOperand(1); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f792d0a..80cd112 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1276,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
}
}
-bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
return vputils::onlyFirstLaneUsed(this);
@@ -1325,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
llvm_unreachable("switch should return");
}
-bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()))
return vputils::onlyFirstPartUsed(this);
@@ -1692,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
if (!VFTy->getParamType(I.index())->isVectorTy())
Arg = State.get(I.value(), VPLane(0));
else
- Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
Args.push_back(Arg);
}
@@ -1761,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.TTI))
Arg = State.get(I.value(), VPLane(0));
else
- Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+ Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
State.TTI))
TysForDecl.push_back(Arg->getType());
@@ -1843,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
-bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
return all_of(enumerate(operands()), [this, &Op](const auto &X) {
auto [Idx, V] = X;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8ad772f..48bd697 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,14 +91,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
- VPIRMetadata(*Load), Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
+ Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- Store->getAlign(), VPIRMetadata(*Store),
- Ingredient.getDebugLoc());
+ VPIRMetadata(*Store), Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -205,7 +204,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
return cast<VPRecipeBase>(U)->getParent() != SinkTo;
});
if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
- return !U->onlyFirstLaneUsed(SinkCandidate);
+ return !U->usesFirstLaneOnly(SinkCandidate);
}))
continue;
bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
@@ -4208,7 +4207,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
auto *L = new VPWidenLoadRecipe(
*LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
+ /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
NarrowedOps.insert(L);
return L;
@@ -4345,7 +4344,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
auto *S = new VPWidenStoreRecipe(
*SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index d6a0028..d4b8b72b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
/// Users that only demand the first lane can use the definition for lane
/// 0.
DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
- return U.onlyFirstLaneUsed(DefR);
+ return U.usesFirstLaneOnly(DefR);
});
// Update each build vector user that currently has DefR as its only
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d3..e22c5df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch;
bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); });
+ [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); });
}
bool vputils::onlyFirstPartUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); });
+ [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); });
}
bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83e3fca..5da7463 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -274,12 +274,12 @@ public:
virtual bool usesScalars(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return onlyFirstLaneUsed(Op);
+ return usesFirstLaneOnly(Op);
}
/// Returns true if the VPUser only uses the first lane of operand \p Op.
/// Conservatively returns false.
- virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+ virtual bool usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return false;
@@ -287,7 +287,7 @@ public:
/// Returns true if the VPUser only uses the first part of operand \p Op.
/// Conservatively returns false.
- virtual bool onlyFirstPartUsed(const VPValue *Op) const {
+ virtual bool usesFirstPartOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return false;
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index b2635d3..3685e9c 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,111 +730,6 @@ entry:
ret void
}
-define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
- <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
-; CHECK-LABEL: store_factor8:
-; CHECK: .Lfunc_begin17:
-; CHECK-NEXT: .cfi_startproc
-; CHECK-NEXT: // %bb.0:
-; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
-; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]]
-; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
-; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]]
-; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
-; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]]
-; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
-; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]]
-; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
-; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
-; CHECK-NEXT: ret
-
- %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
- %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-
- %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
- store <32 x i32> %interleaved.vec, ptr %ptr, align 4
- ret void
-}
-
-define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
- <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7,
- <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11,
- <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
-; CHECK-LABEL: store_factor16:
-; CHECK: .Lfunc_begin18:
-; CHECK-NEXT: .cfi_startproc
-; CHECK-NEXT: // %bb.0:
-; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
-; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
-; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
-; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
-; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
-; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]]
-; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]]
-; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
-; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
-; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
-; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]]
-; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]]
-; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]]
-; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]]
-; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]]
-; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]]
-; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]]
-; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]]
-; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]]
-; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]]
-; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]]
-; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]]
-; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]]
-; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]]
-; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]]
-; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]]
-; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]]
-; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]]
-; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]]
-; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]]
-; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]]
-; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]]
-; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
-; CHECK-NEXT: add x8, x0, #128
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
-; CHECK-NEXT: add x8, x0, #192
-; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
-; CHECK-NEXT: ret
-
- %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
- %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-
- %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
- %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-
- %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
- store <64 x i32> %interleaved.vec, ptr %ptr, align 4
- ret void
-}
-
declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/ARM/ldexp-fp128.ll b/llvm/test/CodeGen/ARM/ldexp-fp128.ll
new file mode 100644
index 0000000..93fcd39e8
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ldexp-fp128.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck -check-prefix=LINUX %s
+
+define fp128 @testExpl(fp128 %val, i32 %a) {
+; LINUX-LABEL: testExpl:
+; LINUX: @ %bb.0:
+; LINUX-NEXT: push {r11, lr}
+; LINUX-NEXT: sub sp, sp, #8
+; LINUX-NEXT: ldr r12, [sp, #16]
+; LINUX-NEXT: str r12, [sp]
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: add sp, sp, #8
+; LINUX-NEXT: pop {r11, pc}
+ %call = tail call fp128 @ldexpl(fp128 %val, i32 %a)
+ ret fp128 %call
+}
+
+declare fp128 @ldexpl(fp128, i32) memory(none)
+
+define fp128 @test_ldexp_f128_i32(fp128 %val, i32 %a) {
+; LINUX-LABEL: test_ldexp_f128_i32:
+; LINUX: @ %bb.0:
+; LINUX-NEXT: push {r11, lr}
+; LINUX-NEXT: sub sp, sp, #8
+; LINUX-NEXT: ldr r12, [sp, #16]
+; LINUX-NEXT: str r12, [sp]
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: add sp, sp, #8
+; LINUX-NEXT: pop {r11, pc}
+ %call = tail call fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+ ret fp128 %call
+}
+
+define <2 x fp128> @test_ldexp_v2f128_v2i32(<2 x fp128> %val, <2 x i32> %a) {
+; LINUX-LABEL: test_ldexp_v2f128_v2i32:
+; LINUX: @ %bb.0:
+; LINUX-NEXT: push {r4, r5, r6, lr}
+; LINUX-NEXT: vpush {d8}
+; LINUX-NEXT: sub sp, sp, #8
+; LINUX-NEXT: mov r5, r3
+; LINUX-NEXT: add r3, sp, #40
+; LINUX-NEXT: mov r6, r2
+; LINUX-NEXT: mov r4, r0
+; LINUX-NEXT: ldm r3, {r0, r1, r2, r3}
+; LINUX-NEXT: vldr d8, [sp, #56]
+; LINUX-NEXT: vst1.32 {d8[1]}, [sp:32]
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: ldr r12, [sp, #32]
+; LINUX-NEXT: vst1.32 {d8[0]}, [sp:32]
+; LINUX-NEXT: ldr lr, [sp, #36]
+; LINUX-NEXT: str r0, [r4, #16]
+; LINUX-NEXT: mov r0, r6
+; LINUX-NEXT: str r1, [r4, #20]
+; LINUX-NEXT: mov r1, r5
+; LINUX-NEXT: str r2, [r4, #24]
+; LINUX-NEXT: mov r2, r12
+; LINUX-NEXT: str r3, [r4, #28]
+; LINUX-NEXT: mov r3, lr
+; LINUX-NEXT: bl ldexpl
+; LINUX-NEXT: stm r4, {r0, r1, r2, r3}
+; LINUX-NEXT: add sp, sp, #8
+; LINUX-NEXT: vpop {d8}
+; LINUX-NEXT: pop {r4, r5, r6, pc}
+ %call = tail call <2 x fp128> @llvm.ldexp.v2f128.v2i32(<2 x fp128> %val, <2 x i32> %a)
+ ret <2 x fp128> %call
+}
diff --git a/llvm/test/CodeGen/LoongArch/ldptr.ll b/llvm/test/CodeGen/LoongArch/ldptr.ll
index c3656a6..9bafa10 100644
--- a/llvm/test/CodeGen/LoongArch/ldptr.ll
+++ b/llvm/test/CodeGen/LoongArch/ldptr.ll
@@ -24,8 +24,7 @@ define signext i32 @ldptr_w(ptr %p) nounwind {
; LA32-LABEL: ldptr_w:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a0, $a0, 1
-; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ld.w $a0, $a0, 1
; LA32-NEXT: ret
;
; LA64-LABEL: ldptr_w:
@@ -81,10 +80,9 @@ entry:
define i64 @ldptr_d(ptr %p) nounwind {
; LA32-LABEL: ldptr_d:
; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a1, $a0, 1
-; LA32-NEXT: ld.w $a0, $a1, 0
-; LA32-NEXT: ld.w $a1, $a1, 4
+; LA32-NEXT: addi.w $a1, $a0, 2047
+; LA32-NEXT: ld.w $a0, $a1, 1
+; LA32-NEXT: ld.w $a1, $a1, 5
; LA32-NEXT: ret
;
; LA64-LABEL: ldptr_d:
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
index 9a806a1..93f73e5 100644
--- a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -25,14 +25,13 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB0_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
@@ -45,8 +44,8 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
-; LA32-NEXT: ld.w $a0, $s2, 4
-; LA32-NEXT: ld.w $a1, $s2, 0
+; LA32-NEXT: ld.w $a0, $s2, 12
+; LA32-NEXT: ld.w $a1, $s2, 8
; LA32-NEXT: add.w $a0, $a0, $s6
; LA32-NEXT: add.w $s3, $a1, $s3
; LA32-NEXT: sltu $a1, $s3, $a1
@@ -63,8 +62,8 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s6, $zero
; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
-; LA32-NEXT: st.w $s3, $s2, 0
-; LA32-NEXT: st.w $s6, $s2, 4
+; LA32-NEXT: st.w $s3, $s2, 8
+; LA32-NEXT: st.w $s6, $s2, 12
; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
@@ -88,8 +87,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB0_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
@@ -100,7 +98,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $a0, $s1, 0
+; LA64-NEXT: ld.d $a0, $s1, 8
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: add.d $s2, $a0, $s2
; LA64-NEXT: bnez $s0, .LBB0_2
@@ -108,7 +106,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: .LBB0_3:
; LA64-NEXT: move $s2, $zero
; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
-; LA64-NEXT: st.d $s2, $s1, 0
+; LA64-NEXT: st.d $s2, $s1, 8
; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -153,14 +151,13 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB1_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
@@ -172,7 +169,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
-; LA32-NEXT: fld.s $fa0, $s2, 0
+; LA32-NEXT: fld.s $fa0, $s2, 16
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
@@ -185,7 +182,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: .LBB1_3:
; LA32-NEXT: movgr2fr.w $fs0, $zero
; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
-; LA32-NEXT: fst.s $fs0, $s2, 0
+; LA32-NEXT: fst.s $fs0, $s2, 16
; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
@@ -208,8 +205,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB1_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
@@ -220,7 +216,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fld.s $fa0, $s1, 0
+; LA64-NEXT: fld.s $fa0, $s1, 16
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
; LA64-NEXT: bnez $s0, .LBB1_2
@@ -228,7 +224,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: .LBB1_3:
; LA64-NEXT: movgr2fr.w $fs0, $zero
; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
-; LA64-NEXT: fst.s $fs0, $s1, 0
+; LA64-NEXT: fst.s $fs0, $s1, 16
; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -271,14 +267,13 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a0, $a0, 6
-; LA32-NEXT: add.w $a0, $a4, $a0
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB2_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
@@ -291,7 +286,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
-; LA32-NEXT: vld $vr0, $s2, 0
+; LA32-NEXT: vld $vr0, $s2, 16
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
@@ -307,7 +302,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: .LBB2_3:
; LA32-NEXT: vrepli.b $vr0, 0
; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
-; LA32-NEXT: vst $vr0, $s2, 0
+; LA32-NEXT: vst $vr0, $s2, 16
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
@@ -326,8 +321,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT: slli.d $a0, $a0, 6
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $a1, .LBB2_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
@@ -340,7 +334,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: vld $vr0, $s1, 0
+; LA64-NEXT: vld $vr0, $s1, 16
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
@@ -351,7 +345,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: .LBB2_3:
; LA64-NEXT: vrepli.b $vr0, 0
; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
-; LA64-NEXT: vst $vr0, $s1, 0
+; LA64-NEXT: vst $vr0, $s1, 16
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -393,14 +387,13 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a0, $a0, 6
-; LA32-NEXT: add.w $a0, $a4, $a0
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 32
+; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB3_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
@@ -413,7 +406,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
-; LA32-NEXT: xvld $xr0, $s2, 0
+; LA32-NEXT: xvld $xr0, $s2, 32
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
@@ -429,7 +422,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: .LBB3_3:
; LA32-NEXT: xvrepli.b $xr0, 0
; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
-; LA32-NEXT: xvst $xr0, $s2, 0
+; LA32-NEXT: xvst $xr0, $s2, 32
; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
@@ -448,8 +441,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
; LA64-NEXT: slli.d $a0, $a0, 6
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 32
+; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $a1, .LBB3_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
@@ -462,7 +454,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: xvld $xr0, $s1, 0
+; LA64-NEXT: xvld $xr0, $s1, 32
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
@@ -473,7 +465,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: .LBB3_3:
; LA64-NEXT: xvrepli.b $xr0, 0
; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
-; LA64-NEXT: xvst $xr0, $s1, 0
+; LA64-NEXT: xvst $xr0, $s1, 32
; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
@@ -516,14 +508,13 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB4_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
@@ -536,7 +527,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
-; LA32-NEXT: vldrepl.b $vr0, $s2, 0
+; LA32-NEXT: vldrepl.b $vr0, $s2, 16
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
@@ -552,7 +543,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: .LBB4_3:
; LA32-NEXT: vrepli.b $vr0, 0
; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
-; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT: vstelm.b $vr0, $s2, 16, 1
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
@@ -573,8 +564,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB4_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
@@ -586,7 +576,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: vldrepl.b $vr0, $s1, 0
+; LA64-NEXT: vldrepl.b $vr0, $s1, 16
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
@@ -597,7 +587,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: .LBB4_3:
; LA64-NEXT: vrepli.b $vr0, 0
; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
-; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT: vstelm.b $vr0, $s1, 16, 1
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -643,14 +633,13 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT: add.w $a0, $a4, $a0
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB5_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
@@ -663,7 +652,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
-; LA32-NEXT: xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT: xvldrepl.d $xr0, $s2, 8
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
@@ -679,7 +668,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-NEXT: .LBB5_3:
; LA32-NEXT: xvrepli.b $xr0, 0
; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
-; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT: xvstelm.d $xr0, $s2, 8, 1
; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
@@ -700,8 +689,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT: add.d $a0, $a2, $a0
-; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB5_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
@@ -713,7 +701,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT: xvldrepl.d $xr0, $s1, 8
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
@@ -724,7 +712,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA64-NEXT: .LBB5_3:
; LA64-NEXT: xvrepli.b $xr0, 0
; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
-; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT: xvstelm.d $xr0, $s1, 8, 1
; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/stptr.ll b/llvm/test/CodeGen/LoongArch/stptr.ll
index d70f9f4..23b433a 100644
--- a/llvm/test/CodeGen/LoongArch/stptr.ll
+++ b/llvm/test/CodeGen/LoongArch/stptr.ll
@@ -23,8 +23,7 @@ define void @stptr_w(ptr %p, i32 signext %val) nounwind {
; LA32-LABEL: stptr_w:
; LA32: # %bb.0:
; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a0, $a0, 1
-; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $a1, $a0, 1
; LA32-NEXT: ret
;
; LA64-LABEL: stptr_w:
@@ -77,9 +76,8 @@ define void @stptr_d(ptr %p, i64 %val) nounwind {
; LA32-LABEL: stptr_d:
; LA32: # %bb.0:
; LA32-NEXT: addi.w $a0, $a0, 2047
-; LA32-NEXT: addi.w $a0, $a0, 1
-; LA32-NEXT: st.w $a2, $a0, 4
-; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $a2, $a0, 5
+; LA32-NEXT: st.w $a1, $a0, 1
; LA32-NEXT: ret
;
; LA64-LABEL: stptr_d:
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 50bd22b..f4964288 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -205,12 +205,19 @@ define i64 @addmul20(i64 %a, i64 %b) {
}
define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64XTHEADBA-LABEL: addmul22:
+; RV64XTHEADBA: # %bb.0:
+; RV64XTHEADBA-NEXT: th.addsl a2, a0, a0, 2
+; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT: ret
%c = mul i64 %a, 22
%d = add i64 %c, %b
ret i64 %d
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7fd7626..d4b2288 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -585,6 +585,33 @@ define i64 @addmul12(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul14(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul14:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a2, a0, 1
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul14:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul14:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 14
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @addmul18(i64 %a, i64 %b) {
; RV64I-LABEL: addmul18:
; RV64I: # %bb.0:
@@ -636,12 +663,26 @@ define i64 @addmul20(i64 %a, i64 %b) {
}
define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul22:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul22:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
%c = mul i64 %a, 22
%d = add i64 %c, %b
ret i64 %d
@@ -672,6 +713,32 @@ define i64 @addmul24(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul26(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul26:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 26
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul26:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul26:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 26
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @addmul36(i64 %a, i64 %b) {
; RV64I-LABEL: addmul36:
; RV64I: # %bb.0:
@@ -722,6 +789,58 @@ define i64 @addmul40(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul38(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul38:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 38
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul38:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul38:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 38
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul42(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul42:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 42
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul42:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul42:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 42
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @addmul72(i64 %a, i64 %b) {
; RV64I-LABEL: addmul72:
; RV64I: # %bb.0:
@@ -747,6 +866,84 @@ define i64 @addmul72(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul74(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul74:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 74
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul74:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul74:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 74
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul82(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul82:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 82
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul82:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul82:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 82
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul146(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul146:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 146
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul146:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul146:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 146
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @mul50(i64 %a) {
; RV64I-LABEL: mul50:
; RV64I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
index d8e2b2c..305ab93 100644
--- a/llvm/test/CodeGen/RISCV/zicond-opts.ll
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -263,3 +263,35 @@ define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) {
%7 = and i64 %6, %f
ret i64 %7
}
+
+define i32 @pr166596(i32 %conv.i, i1 %iszero) #0 {
+; RV32ZICOND-LABEL: pr166596:
+; RV32ZICOND: # %bb.0: # %entry
+; RV32ZICOND-NEXT: andi a1, a1, 1
+; RV32ZICOND-NEXT: xori a0, a0, 1
+; RV32ZICOND-NEXT: zext.h a0, a0
+; RV32ZICOND-NEXT: clz a0, a0
+; RV32ZICOND-NEXT: addi a0, a0, 41
+; RV32ZICOND-NEXT: czero.nez a0, a0, a1
+; RV32ZICOND-NEXT: addi a0, a0, -9
+; RV32ZICOND-NEXT: ret
+;
+; RV64ZICOND-LABEL: pr166596:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: andi a1, a1, 1
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: zext.h a0, a0
+; RV64ZICOND-NEXT: clz a0, a0
+; RV64ZICOND-NEXT: addi a0, a0, 9
+; RV64ZICOND-NEXT: czero.nez a0, a0, a1
+; RV64ZICOND-NEXT: addi a0, a0, -9
+; RV64ZICOND-NEXT: ret
+entry:
+ %not.i = xor i32 %conv.i, 1
+ %conv2.i = trunc i32 %not.i to i16
+ %conv22 = zext i16 %conv2.i to i64
+ %0 = call i64 @llvm.ctlz.i64(i64 %conv22, i1 false)
+ %cast = trunc i64 %0 to i32
+ %clzg = select i1 %iszero, i32 -9, i32 %cast
+ ret i32 %clzg
+}
diff --git a/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll
new file mode 100644
index 0000000..677291a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll
@@ -0,0 +1,36 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=notllvm %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.some.custom %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.,random.prefix %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %}
+
+; The test checks command-line option which allows to represent unknown
+; intrinsics as external function calls in SPIR-V.
+
+; CHECK-ERROR: LLVM ERROR: unable to legalize instruction: %3:iid(s64) = G_READCYCLECOUNTER (in function: foo)
+
+; CHECK: Name %[[READCYCLECOUNTER:[0-9]+]] "spirv.llvm_readcyclecounter"
+; CHECK: Name %[[SOME_CUSTOM_INTRINSIC:[0-9]+]] "spirv.llvm_some_custom_intrinsic"
+; CHECK-DAG: Decorate %[[READCYCLECOUNTER]] LinkageAttributes {{.*}} Import
+; CHECK: Decorate %[[SOME_CUSTOM_INTRINSIC]] LinkageAttributes {{.*}} Import
+; CHECK-DAG: %[[I64:[0-9]+]] = OpTypeInt 64
+; CHECK: %[[FnTy:[0-9]+]] = OpTypeFunction %[[I64]]
+; CHECK: %[[READCYCLECOUNTER]] = OpFunction %[[I64]] {{.*}} %[[FnTy]]
+; CHECK-DAG: %[[SOME_CUSTOM_INTRINSIC]] = OpFunction %[[I64]] {{.*}} %[[FnTy]]
+; CHECK-DAG: OpFunctionCall %[[I64]] %[[READCYCLECOUNTER]]
+; CHECK: OpFunctionCall %[[I64]] %[[SOME_CUSTOM_INTRINSIC]]
+
+define spir_func void @foo() {
+entry:
+; TODO: if and when the SPIR-V learns how to lower readcyclecounter, we will have to pick another unhandled intrinsic
+ %0 = call i64 @llvm.readcyclecounter()
+ %1 = call i64 @llvm.some.custom.intrinsic()
+ ret void
+}
+
+declare i64 @llvm.readcyclecounter()
+declare i64 @llvm.some.custom.intrinsic()
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll
new file mode 100644
index 0000000..f6b6115
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll
@@ -0,0 +1,98 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - -filetype=obj | spirv-val %}
+
+%opencl.pipe_ro_t = type opaque
+%opencl.pipe_wo_t = type opaque
+
+; CHECK-SPIRV: OpCapability BlockingPipesALTERA
+; CHECK-SPIRV: OpExtension "SPV_ALTERA_blocking_pipes"
+; CHECK-SPIRV: %[[PipeRTy:[0-9]+]] = OpTypePipe ReadOnly
+; CHECK-SPIRV: %[[PipeWTy:[0-9]+]] = OpTypePipe WriteOnly
+; CHECK-SPIRV: %[[PipeR1:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR1]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeR2:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR2]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeW1:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW1]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeW2:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW2]] %[[#]] %[[#]] %[[#]]
+
+define spir_func void @foo(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 0), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+
+define spir_func void @bar(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 0), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+
+define spir_func void @boo(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 1), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+
+define spir_func void @baz(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) {
+entry:
+ %p.addr = alloca target("spirv.Pipe", 1), align 8
+ %ptr.addr = alloca ptr addrspace(1), align 8
+ store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8
+ store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+ %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8
+ %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+ %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+ call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4)
+ ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+
+; CHECK-LLVM: declare spir_func void @__read_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32)
+; CHECK-LLVM: declare spir_func void @__write_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32)
+
+define linkonce_odr dso_local spir_func void @WritePipeBLockingi9Pointer(ptr addrspace(4) align 2 dereferenceable(2) %_Data) {
+entry:
+ %_Data.addr = alloca ptr addrspace(4), align 8
+ %_WPipe = alloca target("spirv.Pipe", 1), align 8
+ %_Data.addr.ascast = addrspacecast ptr %_Data.addr to ptr addrspace(4)
+ %_WPipe.ascast = addrspacecast target("spirv.Pipe", 1)* %_WPipe to target("spirv.Pipe", 1) addrspace(4)*
+ store ptr addrspace(4) %_Data, ptr addrspace(4) %_Data.addr.ascast, align 8
+ %0 = bitcast target("spirv.Pipe", 1)* %_WPipe to ptr
+ %1 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1) addrspace(4)* %_WPipe.ascast, align 8
+ %2 = load ptr addrspace(4), ptr addrspace(4) %_Data.addr.ascast, align 8
+ call spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1) %1, ptr addrspace(4) %2, i32 2, i32 2)
+ ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+ \ No newline at end of file
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
index a8d37be..c44b3bb 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
@@ -2808,6 +2808,348 @@ entry:
ret <4 x i32> %spec.store.select7
}
+define <2 x i8> @fptosi_v2f32_v2i8(<2 x float> %x) {
+; CHECK-LABEL: fptosi_v2f32_v2i8:
+; CHECK: .functype fptosi_v2f32_v2i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <2 x float> %x to <2 x i8>
+ ret <2 x i8> %conv
+}
+
+define <2 x i8> @fptoui_v2f32_v2i8(<2 x float> %x) {
+; CHECK-LABEL: fptoui_v2f32_v2i8:
+; CHECK: .functype fptoui_v2f32_v2i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <2 x float> %x to <2 x i8>
+ ret <2 x i8> %conv
+}
+
+define <2 x i16> @fptosi_v2f32_v2i16(<2 x float> %x) {
+; CHECK-LABEL: fptosi_v2f32_v2i16:
+; CHECK: .functype fptosi_v2f32_v2i16 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <2 x float> %x to <2 x i16>
+ ret <2 x i16> %conv
+}
+
+define <2 x i16> @fptoui_v2f32_v2i16(<2 x float> %x) {
+; CHECK-LABEL: fptoui_v2f32_v2i16:
+; CHECK: .functype fptoui_v2f32_v2i16 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <2 x float> %x to <2 x i16>
+ ret <2 x i16> %conv
+}
+
+define <4 x i8> @fptosi_v4f32_v4i8(<4 x float> %x) {
+; CHECK-LABEL: fptosi_v4f32_v4i8:
+; CHECK: .functype fptosi_v4f32_v4i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <4 x float> %x to <4 x i8>
+ ret <4 x i8> %conv
+}
+
+define <4 x i8> @fptoui_v4f32_v4i8(<4 x float> %x) {
+; CHECK-LABEL: fptoui_v4f32_v4i8:
+; CHECK: .functype fptoui_v4f32_v4i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <4 x float> %x to <4 x i8>
+ ret <4 x i8> %conv
+}
+
+define <4 x i16> @fptosi_v4f32_v4i16(<4 x float> %x) {
+; CHECK-LABEL: fptosi_v4f32_v4i16:
+; CHECK: .functype fptosi_v4f32_v4i16 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <4 x float> %x to <4 x i16>
+ ret <4 x i16> %conv
+}
+
+define <4 x i16> @fptoui_v4f32_v4i16(<4 x float> %x) {
+; CHECK-LABEL: fptoui_v4f32_v4i16:
+; CHECK: .functype fptoui_v4f32_v4i16 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <4 x float> %x to <4 x i16>
+ ret <4 x i16> %conv
+}
+
+define <8 x i8> @fptosi_v8f32_v8i8(<8 x float> %x) {
+; CHECK-LABEL: fptosi_v8f32_v8i8:
+; CHECK: .functype fptosi_v8f32_v8i8 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <8 x float> %x to <8 x i8>
+ ret <8 x i8> %conv
+}
+
+define <8 x i8> @fptoui_v8f32_v8i8(<8 x float> %x) {
+; CHECK-LABEL: fptoui_v8f32_v8i8:
+; CHECK: .functype fptoui_v8f32_v8i8 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <8 x float> %x to <8 x i8>
+ ret <8 x i8> %conv
+}
+
+define <8 x i16> @fptosi_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptosi_v8f32_v8i16:
+; CHECK: .functype fptosi_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <8 x float> %x to <8 x i16>
+ ret <8 x i16> %conv
+}
+
+define <8 x i16> @fptoui_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptoui_v8f32_v8i16:
+; CHECK: .functype fptoui_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <8 x float> %x to <8 x i16>
+ ret <8 x i16> %conv
+}
+
+define <16 x i8> @fptosi_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i8:
+; CHECK: .functype fptosi_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <16 x float> %x to <16 x i8>
+ ret <16 x i8> %conv
+}
+
+define <16 x i8> @fptoui_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i8:
+; CHECK: .functype fptoui_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 255, 255, 255, 255
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <16 x float> %x to <16 x i8>
+ ret <16 x i8> %conv
+}
+
+define <16 x i16> @fptosi_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i16:
+; CHECK: .functype fptosi_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptosi <16 x float> %x to <16 x i16>
+ ret <16 x i16> %conv
+}
+
+define <16 x i16> @fptoui_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i16:
+; CHECK: .functype fptoui_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: # fallthrough-return
+ %conv = fptoui <16 x float> %x to <16 x i16>
+ ret <16 x i16> %conv
+}
+
declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>)
declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>)
declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 5eb49fd..404db23 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
@@ -20,17 +20,17 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
; CHECK: v128.store
define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -64,17 +64,17 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
; CHECK: v128.store
define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -208,27 +208,27 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -276,27 +276,27 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -343,27 +343,27 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
; CHECK-LABEL: four_shorts_interleave_op:
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -483,19 +483,19 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
; CHECK: v128.store
; CHECK: i16x8.extmul_low_i8x16_u
; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
; CHECK: v128.store
define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -529,18 +529,18 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
+; CHECK: i8x16.shuffle 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
; CHECK: v128.store
; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
+; CHECK: i8x16.shuffle 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
; CHECK: v128.store
define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -672,27 +672,27 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: v128.store
define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -740,25 +740,25 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}}, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}}, 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
+; CHECK: i8x16.shuffle 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
; CHECK: v128.store
define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -806,27 +806,27 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
; CHECK: v128.store
define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -1272,45 +1272,45 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
; CHECK-LABEL: four_bytes_into_four_ints_vary_op:
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extend_low_i16x8_u
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extend_low_i16x8_u
; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extmul_low_i16x8_u
; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
%5 = icmp eq i32 %3, 0
@@ -1365,7 +1365,7 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
; CHECK: v128.store
define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
%5 = icmp sgt i32 %3, 0
@@ -1396,35 +1396,35 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i16x8.add
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i16x8.add
; CHECK: i16x8.add
; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_u
; CHECK: i16x8.add
; CHECK: i16x8.add
; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
; CHECK: v128.store
define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
%5 = icmp sgt i32 %3, 0
@@ -1492,13 +1492,13 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.avgr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i8x16.avgr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
; CHECK: v128.store
define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
%5 = icmp sgt i32 %3, 0
@@ -1605,28 +1605,28 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: two_bytes_two_floats_same_op:
; CHECK: loop
; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
; CHECK: v128.store
define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1663,28 +1663,28 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: two_bytes_two_floats_vary_op:
; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
; CHECK: v128.store
define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1723,38 +1723,24 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.store64_lane
define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1791,38 +1777,24 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: v128.store64_lane
define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1858,24 +1830,24 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: two_shorts_two_floats_same_op:
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
; CHECK: v128.store
define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1913,24 +1885,24 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: two_shorts_two_floats_vary_op:
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
; CHECK: v128.store
define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1969,38 +1941,22 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
; CHECK: v128.store
define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2037,38 +1993,22 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
; CHECK: v128.store
define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2195,58 +2135,58 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: four_bytes_four_floats_same_op:
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2302,58 +2242,58 @@ for.body: ; preds = %entry, %for.body
; CHECK-LABEL: four_bytes_four_floats_vary_op:
; CHECK: loop
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK: i16x8.extend_low_i8x16_s
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2410,88 +2350,60 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
; CHECK: v128.store
define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2544,88 +2456,60 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
; CHECK: v128.store
define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2678,51 +2562,51 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2779,47 +2663,47 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
; CHECK: i32x4.extend_low_i16x8_s
; CHECK: f32x4.convert_i32x4_s
; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.store
define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2876,89 +2760,58 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
; CHECK: v128.store
define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -3011,89 +2864,58 @@ for.body: ; preds = %entry, %for.body
; CHECK: loop
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
; CHECK: v128.load
; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const 65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
; CHECK: v128.store
define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
new file mode 100644
index 0000000..de2a5e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA
+
+; FIXME: Ensure reloads are after narrowed i512 -> i32 store
+define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
+; POSTRA-LABEL: PR166744:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: movl $1029, %eax # imm = 0x405
+; POSTRA-NEXT: shlxl %esi, %edx, %edx
+; POSTRA-NEXT: bextrl %eax, %esi, %eax
+; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx
+; POSTRA-NEXT: btrl %esi, %ecx
+; POSTRA-NEXT: movq 8(%rdi), %rsi
+; POSTRA-NEXT: orq 40(%rdi), %rsi
+; POSTRA-NEXT: orl %ecx, %edx
+; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
+; POSTRA-NEXT: movq 16(%rdi), %rax
+; POSTRA-NEXT: movq 24(%rdi), %rdx
+; POSTRA-NEXT: orq 56(%rdi), %rdx
+; POSTRA-NEXT: orq 48(%rdi), %rax
+; POSTRA-NEXT: movq (%rdi), %rcx
+; POSTRA-NEXT: orq 32(%rdi), %rcx
+; POSTRA-NEXT: orq %rdx, %rsi
+; POSTRA-NEXT: orq %rax, %rcx
+; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: retq
+;
+; NOPOSTRA-LABEL: PR166744:
+; NOPOSTRA: # %bb.0:
+; NOPOSTRA-NEXT: movl %esi, %eax
+; NOPOSTRA-NEXT: shrl $3, %eax
+; NOPOSTRA-NEXT: andl $60, %eax
+; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx
+; NOPOSTRA-NEXT: btrl %esi, %ecx
+; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx
+; NOPOSTRA-NEXT: orl %ecx, %edx
+; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax)
+; NOPOSTRA-NEXT: movq 16(%rdi), %rax
+; NOPOSTRA-NEXT: movq (%rdi), %rcx
+; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
+; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
+; NOPOSTRA-NEXT: orq 48(%rdi), %rax
+; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
+; NOPOSTRA-NEXT: orq %rsi, %rdx
+; NOPOSTRA-NEXT: orq %rax, %rcx
+; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: retq
+ %rem = and i64 %idx, 511
+ %sh_prom = zext nneg i64 %rem to i512
+ %shl = shl nuw i512 1, %sh_prom
+ %not = xor i512 %shl, -1
+ %load = load i512, ptr %v, align 8
+ %and = and i512 %load, %not
+ %conv2 = zext i1 %b to i512
+ %shl4 = shl nuw i512 %conv2, %sh_prom
+ %or = or i512 %and, %shl4
+ store i512 %or, ptr %v, align 8
+ %cmp = icmp ne i512 %or, 0
+ ret i1 %cmp
+}
diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s
index 78e4f86..ff0dfb3 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_err.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_err.s
@@ -674,46 +674,3 @@ v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
-// nv bit in FLAT instructions
-flat_load_ubyte v5, v[2:3] offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_load_ubyte a5, v[2:3] offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_store_dword v[2:3], v5 offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_store_dword v[2:3], a5 offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_load_ubyte v5, v[2:3], off offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_store_byte v[2:3], v5, off offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_add v[2:3], v5, off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap a1, v[2:3], a2, off glc nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap_x2 v[2:3], v[4:5], off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap_x2 v[2:3], a[4:5], off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_load_ubyte v5, off, s2 offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_load_ubyte a5, off, s2 offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_store_dword v2, v3, off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
index 3af0d83..c96a72d 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -706,107 +706,107 @@ flat_load_short_d16_hi a5, v[2:3] offset:4095 glc
flat_load_short_d16_hi a5, v[2:3] offset:4095 slc
// GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_swap a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_add a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_sub a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_smin a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_umin a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_smax a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_umax a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_and a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_or a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_xor a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_inc a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_dec a0, v[2:3], a2 offset:4095 glc
// GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc
// GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
// GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx942_err.s b/llvm/test/MC/AMDGPU/gfx942_err.s
index dc51bab..fd59a01 100644
--- a/llvm/test/MC/AMDGPU/gfx942_err.s
+++ b/llvm/test/MC/AMDGPU/gfx942_err.s
@@ -125,31 +125,3 @@ global_load_dword v[2:3], off lds
scratch_load_dword v2, off lds
// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-// nv bit in FLAT instructions
-flat_load_ubyte v5, v[2:3] offset:4095 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_store_dword v[2:3], v5 offset:4095 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_atomic_add_f32 v[2:3], v5 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_load_dword v2, v[2:3], off sc0 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_store_dword v[2:3], v5 off sc0 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_add_f64 v[0:1], v[2:3], off sc1 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap v0, v[2:3], v5 off sc0 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_load_lds_dword v2, off nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_store_dword v2, v3, off nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
index 7687c0a..5cc3d25 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
@@ -24,18 +24,6 @@ flat_load_ubyte v5, v[1:2] offset:4095 glc
flat_load_ubyte v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
-flat_load_ubyte v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_sbyte v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
@@ -60,18 +48,6 @@ flat_load_sbyte v5, v[1:2] offset:4095 glc
flat_load_sbyte v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
-flat_load_sbyte v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_ushort v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
@@ -96,18 +72,6 @@ flat_load_ushort v5, v[1:2] offset:4095 glc
flat_load_ushort v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
-flat_load_ushort v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ushort v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ushort v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ushort v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_sshort v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
@@ -132,18 +96,6 @@ flat_load_sshort v5, v[1:2] offset:4095 glc
flat_load_sshort v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
-flat_load_sshort v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sshort v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sshort v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sshort v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_dword v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
@@ -168,18 +120,6 @@ flat_load_dword v5, v[1:2] offset:4095 glc
flat_load_dword v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
-flat_load_dword v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dword v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dword v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dword v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_dwordx2 v[5:6], v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
@@ -204,18 +144,6 @@ flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc
flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
-flat_load_dwordx2 v[5:6], v[1:2] nv
-// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_dwordx3 v[5:7], v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
@@ -240,18 +168,6 @@ flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc
flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
-flat_load_dwordx3 v[5:7], v[1:2] nv
-// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_dwordx4 v[5:8], v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
@@ -276,18 +192,6 @@ flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc
flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
-flat_load_dwordx4 v[5:8], v[1:2] nv
-// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
-
flat_store_byte v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
@@ -312,18 +216,6 @@ flat_store_byte v[1:2], v2 offset:4095 glc
flat_store_byte v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
-flat_store_byte v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_byte_d16_hi v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
@@ -348,18 +240,6 @@ flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc
flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
-flat_store_byte_d16_hi v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte_d16_hi v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_short v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
@@ -384,18 +264,6 @@ flat_store_short v[1:2], v2 offset:4095 glc
flat_store_short v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
-flat_store_short v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_short_d16_hi v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
@@ -420,18 +288,6 @@ flat_store_short_d16_hi v[1:2], v2 offset:4095 glc
flat_store_short_d16_hi v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
-flat_store_short_d16_hi v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short_d16_hi v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_dword v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
@@ -456,18 +312,6 @@ flat_store_dword v[1:2], v2 offset:4095 glc
flat_store_dword v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
-flat_store_dword v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dword v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dword v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dword v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_dwordx2 v[1:2], v[2:3] offset:4095
// CHECK: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
@@ -492,18 +336,6 @@ flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc
flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc
// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
-flat_store_dwordx2 v[1:2], v[2:3] nv
-// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv
-// CHECK: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_dwordx3 v[1:2], v[2:4] offset:4095
// CHECK: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
@@ -528,18 +360,6 @@ flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc
flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc
// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
-flat_store_dwordx3 v[1:2], v[2:4] nv
-// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv
-// CHECK: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
-
flat_store_dwordx4 v[1:2], v[2:5] offset:4095
// CHECK: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
@@ -564,18 +384,6 @@ flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc
flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc
// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
-flat_store_dwordx4 v[1:2], v[2:5] nv
-// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv
-// CHECK: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
-
flat_load_ubyte_d16 v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
@@ -600,18 +408,6 @@ flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc
flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
-flat_load_ubyte_d16 v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16 v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_ubyte_d16_hi v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
@@ -636,18 +432,6 @@ flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc
flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
-flat_load_ubyte_d16_hi v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_sbyte_d16 v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
@@ -672,18 +456,6 @@ flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc
flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
-flat_load_sbyte_d16 v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16 v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_sbyte_d16_hi v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
@@ -708,18 +480,6 @@ flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc
flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
-flat_load_sbyte_d16_hi v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_short_d16 v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
@@ -744,18 +504,6 @@ flat_load_short_d16 v5, v[1:2] offset:4095 glc
flat_load_short_d16 v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
-flat_load_short_d16 v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16 v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16 v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16 v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
-
flat_load_short_d16_hi v5, v[1:2] offset:4095
// CHECK: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
@@ -780,18 +528,6 @@ flat_load_short_d16_hi v5, v[1:2] offset:4095 glc
flat_load_short_d16_hi v5, v[1:2] offset:4095 slc
// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
-flat_load_short_d16_hi v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16_hi v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
-
flat_atomic_swap v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
@@ -816,18 +552,6 @@ flat_atomic_swap v0, v[1:2], v2 offset:4095 glc
flat_atomic_swap v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
-flat_atomic_swap v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_swap v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_swap v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
-
flat_atomic_cmpswap v[1:2], v[2:3] offset:4095
// CHECK: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
@@ -852,18 +576,6 @@ flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc
flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc
// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
-flat_atomic_cmpswap v[1:2], v[2:3] nv
-// CHECK: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv
-// CHECK: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
-
flat_atomic_add v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
@@ -888,18 +600,6 @@ flat_atomic_add v0, v[1:2], v2 offset:4095 glc
flat_atomic_add v[1:2], v2 offset:4095 slc
// CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
-flat_atomic_add v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_add v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_add v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
-
flat_atomic_sub v[1:2], v2 offset:4095
// CHECK: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
@@ -1497,18 +1197,6 @@ global_load_ubyte v5, v1, s[4:5] offset:-1 glc
global_load_ubyte v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x04,0x05]
-global_load_ubyte v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
-
global_load_sbyte v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x04,0x05]
@@ -1554,18 +1242,6 @@ global_load_sbyte v5, v1, s[4:5] offset:-1 glc
global_load_sbyte v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x04,0x05]
-global_load_sbyte v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
-
global_load_ushort v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x04,0x05]
@@ -1611,18 +1287,6 @@ global_load_ushort v5, v1, s[4:5] offset:-1 glc
global_load_ushort v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x04,0x05]
-global_load_ushort v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ushort v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ushort v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ushort v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
-
global_load_sshort v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x04,0x05]
@@ -1668,18 +1332,6 @@ global_load_sshort v5, v1, s[4:5] offset:-1 glc
global_load_sshort v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x04,0x05]
-global_load_sshort v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sshort v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sshort v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sshort v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
-
global_load_dword v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x04,0x05]
@@ -1725,18 +1377,6 @@ global_load_dword v5, v1, s[4:5] offset:-1 glc
global_load_dword v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x04,0x05]
-global_load_dword v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dword v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dword v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dword v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
-
global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x04,0x05]
@@ -1782,18 +1422,6 @@ global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc
global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x04,0x05]
-global_load_dwordx2 v[5:6], v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
-
global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x04,0x05]
@@ -1839,15 +1467,6 @@ global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc
global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x04,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
-
global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x04,0x05]
@@ -1893,15 +1512,6 @@ global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc
global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x04,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
-
global_store_byte v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x06,0x00]
@@ -1947,18 +1557,6 @@ global_store_byte v1, v2, s[6:7] offset:-1 glc
global_store_byte v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x06,0x00]
-global_store_byte v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
-
global_store_byte_d16_hi v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x06,0x00]
@@ -2004,18 +1602,6 @@ global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc
global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x06,0x00]
-global_store_byte_d16_hi v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
-
global_store_short v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x06,0x00]
@@ -2061,18 +1647,6 @@ global_store_short v1, v2, s[6:7] offset:-1 glc
global_store_short v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x06,0x00]
-global_store_short v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
-
global_store_short_d16_hi v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x06,0x00]
@@ -2118,18 +1692,6 @@ global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc
global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x06,0x00]
-global_store_short_d16_hi v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
-
global_store_dword v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x06,0x00]
@@ -2175,18 +1737,6 @@ global_store_dword v1, v2, s[6:7] offset:-1 glc
global_store_dword v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x06,0x00]
-global_store_dword v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dword v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dword v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dword v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
-
global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x06,0x00]
@@ -2232,18 +1782,6 @@ global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc
global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x06,0x00]
-global_store_dwordx2 v1, v[2:3], s[6:7] nv
-// CHECK: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
-
global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x06,0x00]
@@ -2289,18 +1827,6 @@ global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc
global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x06,0x00]
-global_store_dwordx3 v1, v[2:4], s[6:7] nv
-// CHECK: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
-
global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x06,0x00]
@@ -2346,18 +1872,6 @@ global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc
global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x06,0x00]
-global_store_dwordx4 v1, v[2:5], s[6:7] nv
-// CHECK: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
-
global_load_ubyte_d16 v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x04,0x05]
@@ -2403,18 +1917,6 @@ global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc
global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x04,0x05]
-global_load_ubyte_d16 v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
-
global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x04,0x05]
@@ -2460,18 +1962,6 @@ global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc
global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x04,0x05]
-global_load_ubyte_d16_hi v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
-
global_load_sbyte_d16 v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x04,0x05]
@@ -2517,18 +2007,6 @@ global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc
global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x04,0x05]
-global_load_sbyte_d16 v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
-
global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x04,0x05]
@@ -2574,18 +2052,6 @@ global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc
global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x04,0x05]
-global_load_sbyte_d16_hi v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
-
global_load_short_d16 v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x04,0x05]
@@ -2631,18 +2097,6 @@ global_load_short_d16 v5, v1, s[4:5] offset:-1 glc
global_load_short_d16 v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x04,0x05]
-global_load_short_d16 v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16 v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
-
global_load_short_d16_hi v5, v1, s[4:5] offset:-1
// CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x04,0x05]
@@ -2688,18 +2142,6 @@ global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc
global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc
// CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x04,0x05]
-global_load_short_d16_hi v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
-
global_atomic_swap v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x06,0x00]
@@ -2745,18 +2187,6 @@ global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc
global_atomic_swap v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x06,0x00]
-global_atomic_swap v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_swap v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
-
global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x06,0x00]
@@ -2802,18 +2232,6 @@ global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc
global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x06,0x00]
-global_atomic_cmpswap v1, v[2:3], s[6:7] nv
-// CHECK: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
-
global_atomic_add v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x06,0x00]
@@ -2859,18 +2277,6 @@ global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc
global_atomic_add v1, v2, s[6:7] offset:-1 slc
// CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x06,0x00]
-global_atomic_add v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_add v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_add v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
-
global_atomic_sub v1, v2, s[6:7] offset:-1
// CHECK: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x06,0x00]
@@ -3951,18 +3357,6 @@ scratch_load_ubyte v5, off, s2 offset:-1 glc
scratch_load_ubyte v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_ubyte v5, off, s2 nv
-// CHECK: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_sbyte v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
@@ -4008,18 +3402,6 @@ scratch_load_sbyte v5, off, s2 offset:-1 glc
scratch_load_sbyte v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_sbyte v5, off, s2 nv
-// CHECK: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_ushort v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
@@ -4065,18 +3447,6 @@ scratch_load_ushort v5, off, s2 offset:-1 glc
scratch_load_ushort v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_ushort v5, off, s2 nv
-// CHECK: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ushort v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ushort v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ushort v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_sshort v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
@@ -4122,18 +3492,6 @@ scratch_load_sshort v5, off, s2 offset:-1 glc
scratch_load_sshort v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_sshort v5, off, s2 nv
-// CHECK: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sshort v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sshort v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sshort v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_dword v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
@@ -4179,18 +3537,6 @@ scratch_load_dword v5, off, s2 offset:-1 glc
scratch_load_dword v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_dword v5, off, s2 nv
-// CHECK: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dword v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dword v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dword v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_dwordx2 v[5:6], off, s2 offset:-1
// CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
@@ -4236,18 +3582,6 @@ scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc
scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_dwordx2 v[5:6], off, s2 nv
-// CHECK: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_dwordx3 v[5:7], off, s2 offset:-1
// CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
@@ -4293,18 +3627,6 @@ scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc
scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_dwordx3 v[5:7], off, s2 nv
-// CHECK: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_dwordx4 v[5:8], off, s2 offset:-1
// CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
@@ -4350,18 +3672,6 @@ scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc
scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_dwordx4 v[5:8], off, s2 nv
-// CHECK: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
-
scratch_store_byte off, v2, s3 offset:-1
// CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
@@ -4407,18 +3717,6 @@ scratch_store_byte off, v2, s3 offset:-1 glc
scratch_store_byte off, v2, s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_byte off, v2, s3 nv
-// CHECK: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_byte_d16_hi off, v2, s3 offset:-1
// CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
@@ -4464,18 +3762,6 @@ scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc
scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_byte_d16_hi off, v2, s3 nv
-// CHECK: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_short off, v2, s3 offset:-1
// CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
@@ -4521,18 +3807,6 @@ scratch_store_short off, v2, s3 offset:-1 glc
scratch_store_short off, v2, s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_short off, v2, s3 nv
-// CHECK: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_short_d16_hi off, v2, s3 offset:-1
// CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
@@ -4578,18 +3852,6 @@ scratch_store_short_d16_hi off, v2, s3 offset:-1 glc
scratch_store_short_d16_hi off, v2, s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_short_d16_hi off, v2, s3 nv
-// CHECK: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short_d16_hi off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_dword off, v2, s3 offset:-1
// CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
@@ -4635,18 +3897,6 @@ scratch_store_dword off, v2, s3 offset:-1 glc
scratch_store_dword off, v2, s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_dword off, v2, s3 nv
-// CHECK: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dword off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dword off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dword off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_dwordx2 off, v[2:3], s3 offset:-1
// CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
@@ -4692,18 +3942,6 @@ scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc
scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_dwordx2 off, v[2:3], s3 nv
-// CHECK: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_dwordx3 off, v[2:4], s3 offset:-1
// CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
@@ -4749,18 +3987,6 @@ scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc
scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_dwordx3 off, v[2:4], s3 nv
-// CHECK: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
-
scratch_store_dwordx4 off, v[2:5], s3 offset:-1
// CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
@@ -4806,18 +4032,6 @@ scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc
scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc
// CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
-scratch_store_dwordx4 off, v[2:5], s3 nv
-// CHECK: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
-
scratch_load_ubyte_d16 v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
@@ -4863,18 +4077,6 @@ scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc
scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_ubyte_d16 v5, off, s2 nv
-// CHECK: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_ubyte_d16_hi v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
@@ -4920,18 +4122,6 @@ scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc
scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_ubyte_d16_hi v5, off, s2 nv
-// CHECK: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_sbyte_d16 v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
@@ -4977,18 +4167,6 @@ scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc
scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_sbyte_d16 v5, off, s2 nv
-// CHECK: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_sbyte_d16_hi v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
@@ -5034,18 +4212,6 @@ scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc
scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_sbyte_d16_hi v5, off, s2 nv
-// CHECK: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_short_d16 v5, off, s2 offset:-1
// CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
@@ -5088,18 +4254,6 @@ scratch_load_short_d16 v5, off, s2 offset:-4096
scratch_load_short_d16 v5, off, s2 offset:-1 glc
// CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_short_d16 v5, off, s2 nv
-// CHECK: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16 v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16 v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16 v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
-
scratch_load_short_d16 v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
@@ -5148,18 +4302,6 @@ scratch_load_short_d16_hi v5, off, s2 offset:-1 glc
scratch_load_short_d16_hi v5, off, s2 offset:-1 slc
// CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
-scratch_load_short_d16_hi v5, off, s2 nv
-// CHECK: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16_hi v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
-
global_load_dword v[2:3], off lds
// CHECK: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
index 4c06585..0ee659e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
@@ -21,18 +21,6 @@
# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_ubyte v5, v[1:2] nv ; encoding: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05
@@ -54,18 +42,6 @@
# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_sbyte v5, v[1:2] nv ; encoding: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_ushort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05
@@ -87,18 +63,6 @@
# CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_ushort v5, v[1:2] nv ; encoding: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ushort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ushort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_sshort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05
@@ -120,18 +84,6 @@
# CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_sshort v5, v[1:2] nv ; encoding: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sshort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sshort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_dword v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05
@@ -153,18 +105,6 @@
# CHECK: flat_load_dword v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_dword v5, v[1:2] nv ; encoding: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dword v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dword v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dword v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05
@@ -186,18 +126,6 @@
# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] nv ; encoding: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05
@@ -219,18 +147,6 @@
# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] nv ; encoding: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05
@@ -252,18 +168,6 @@
# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] nv ; encoding: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_store_byte v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00
@@ -285,18 +189,6 @@
# CHECK: flat_store_byte v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_byte v[1:2], v2 nv ; encoding: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00
@@ -318,18 +210,6 @@
# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 nv ; encoding: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_short v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00
@@ -351,18 +231,6 @@
# CHECK: flat_store_short v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_short v[1:2], v2 nv ; encoding: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00
@@ -384,18 +252,6 @@
# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_short_d16_hi v[1:2], v2 nv ; encoding: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_dword v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00
@@ -417,18 +273,6 @@
# CHECK: flat_store_dword v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_dword v[1:2], v2 nv ; encoding: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dword v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dword v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dword v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00
@@ -450,18 +294,6 @@
# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] nv ; encoding: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00
@@ -483,18 +315,6 @@
# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] nv ; encoding: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv ; encoding: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv ; encoding: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00
@@ -516,18 +336,6 @@
# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] nv ; encoding: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv ; encoding: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv ; encoding: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00
-
# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05
@@ -549,18 +357,6 @@
# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] nv ; encoding: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05
@@ -582,18 +378,6 @@
# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] nv ; encoding: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05
@@ -615,18 +399,6 @@
# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] nv ; encoding: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05
@@ -648,18 +420,6 @@
# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] nv ; encoding: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05
@@ -681,18 +441,6 @@
# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_short_d16 v5, v[1:2] nv ; encoding: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05
@@ -714,18 +462,6 @@
# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05
-# CHECK: flat_load_short_d16_hi v5, v[1:2] nv ; encoding: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05
-
# CHECK: flat_atomic_swap v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00
@@ -747,18 +483,6 @@
# CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00
-# CHECK: flat_atomic_swap v[1:2], v2 nv ; encoding: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_swap v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00
-
# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00
@@ -780,18 +504,6 @@
# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00
-# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] nv ; encoding: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00
-
# CHECK: flat_atomic_add v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00
@@ -813,18 +525,6 @@
# CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00
-# CHECK: flat_atomic_add v[1:2], v2 nv ; encoding: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_add v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00
-
# CHECK: flat_atomic_sub v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00
@@ -1317,18 +1017,6 @@
# CHECK: global_load_ubyte v5, v[1:2], off ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_ubyte v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_sbyte v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05
@@ -1338,18 +1026,6 @@
# CHECK: global_load_sbyte v5, v[1:2], off ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_sbyte v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_ushort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05
@@ -1359,18 +1035,6 @@
# CHECK: global_load_ushort v5, v[1:2], off ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_ushort v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_sshort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05
@@ -1380,18 +1044,6 @@
# CHECK: global_load_sshort v5, v[1:2], off ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_sshort v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_dword v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05
@@ -1401,18 +1053,6 @@
# CHECK: global_load_dword v5, v[1:2], off ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_dword v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_dwordx2 v[5:6], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05
@@ -1422,18 +1062,6 @@
# CHECK: global_load_dwordx2 v[5:6], v[1:2], off ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] nv ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_dwordx3 v[5:7], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05
@@ -1443,18 +1071,6 @@
# CHECK: global_load_dwordx3 v[5:7], v[1:2], off ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] nv ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_dwordx4 v[5:8], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05
@@ -1464,18 +1080,6 @@
# CHECK: global_load_dwordx4 v[5:8], v[1:2], off ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] nv ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_store_byte v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00
@@ -1485,18 +1089,6 @@
# CHECK: global_store_byte v[1:2], v2, off ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_byte v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_byte_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00
@@ -1506,18 +1098,6 @@
# CHECK: global_store_byte_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_short v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00
@@ -1527,18 +1107,6 @@
# CHECK: global_store_short v[1:2], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_short v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_short_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00
@@ -1548,18 +1116,6 @@
# CHECK: global_store_short_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_dword v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00
@@ -1569,18 +1125,6 @@
# CHECK: global_store_dword v[1:2], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_dword v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_dwordx2 v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00
@@ -1590,18 +1134,6 @@
# CHECK: global_store_dwordx2 v[1:2], v[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_dwordx3 v[1:2], v[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00
@@ -1611,18 +1143,6 @@
# CHECK: global_store_dwordx3 v[1:2], v[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] nv ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_store_dwordx4 v[1:2], v[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00
@@ -1632,18 +1152,6 @@
# CHECK: global_store_dwordx4 v[1:2], v[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00]
0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] nv ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00
-
# CHECK: global_load_ubyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05
@@ -1653,18 +1161,6 @@
# CHECK: global_load_ubyte_d16 v5, v[1:2], off ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_ubyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05
@@ -1674,18 +1170,6 @@
# CHECK: global_load_ubyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_sbyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05
@@ -1695,18 +1179,6 @@
# CHECK: global_load_sbyte_d16 v5, v[1:2], off ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_sbyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05
@@ -1716,18 +1188,6 @@
# CHECK: global_load_sbyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_short_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05
@@ -1737,18 +1197,6 @@
# CHECK: global_load_short_d16 v5, v[1:2], off ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_short_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_load_short_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05]
0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05
@@ -1758,18 +1206,6 @@
# CHECK: global_load_short_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05]
0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05
-
# CHECK: global_atomic_swap v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00
@@ -1779,18 +1215,6 @@
# CHECK: global_atomic_swap v[1:2], v2, off ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00]
0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00
-# CHECK: global_atomic_swap v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
-0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00
-
# CHECK: global_atomic_cmpswap v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00
@@ -1812,18 +1236,6 @@
# CHECK: global_atomic_cmpswap v1, v[2:3], v[4:5], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01]
0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01
-# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
-0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00
-
# CHECK: global_atomic_add v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00
@@ -1833,18 +1245,6 @@
# CHECK: global_atomic_add v[1:2], v2, off ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00]
0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00
-# CHECK: global_atomic_add v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
-0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00
-
# CHECK: global_atomic_sub v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00]
0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00
@@ -2103,18 +1503,6 @@
# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_ubyte v5, off, s2 nv ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05
@@ -2154,18 +1542,6 @@
# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_sbyte v5, off, s2 nv ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_ushort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05
@@ -2205,18 +1581,6 @@
# CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_ushort v5, off, s2 nv ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ushort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ushort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_sshort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05
@@ -2256,18 +1620,6 @@
# CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_sshort v5, off, s2 nv ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sshort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sshort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_dword v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05
@@ -2307,18 +1659,6 @@
# CHECK: scratch_load_dword v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_dword v5, off, s2 nv ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dword v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dword v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dword v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05
@@ -2358,18 +1698,6 @@
# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 nv ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05
@@ -2409,18 +1737,6 @@
# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 nv ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05
@@ -2460,18 +1776,6 @@
# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 nv ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_store_byte off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00
@@ -2511,18 +1815,6 @@
# CHECK: scratch_store_byte off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_byte off, v2, s3 nv ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00
@@ -2562,18 +1854,6 @@
# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_short off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00
@@ -2613,18 +1893,6 @@
# CHECK: scratch_store_short off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_short off, v2, s3 nv ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00
@@ -2664,18 +1932,6 @@
# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_short_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_dword off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00
@@ -2715,18 +1971,6 @@
# CHECK: scratch_store_dword off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_dword off, v2, s3 nv ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dword off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dword off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dword off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00
@@ -2766,18 +2010,6 @@
# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 nv ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00
@@ -2817,18 +2049,6 @@
# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 nv ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00
@@ -2868,18 +2088,6 @@
# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 nv ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00
-
# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05
@@ -2919,18 +2127,6 @@
# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 nv ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05
@@ -2970,18 +2166,6 @@
# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05
@@ -3021,18 +2205,6 @@
# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 nv ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05
@@ -3072,18 +2244,6 @@
# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05
@@ -3123,18 +2283,6 @@
# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_short_d16 v5, off, s2 nv ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05
@@ -3174,18 +2322,6 @@
# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05
-# CHECK: scratch_load_short_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05
-
# CHECK: global_load_dword v[2:3], off lds ; encoding: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
deleted file mode 100644
index bd5f4e2..0000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16 -S < %s | FileCheck %s
-
-define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) {
-; CHECK-LABEL: define dso_local void @_Z6unpackPhS_(
-; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) {
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]]
-; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT: store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]]
-;
-entry:
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.body
- ret void
-
-for.body: ; preds = %entry, %for.body
- %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ]
- %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ]
- %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ]
- store i8 0, ptr %out.addr.032, align 1
- %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3
- %0 = load i8, ptr %arrayidx10, align 1
- %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1
- store i8 %0, ptr %arrayidx14, align 1
- %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2
- %1 = load i8, ptr %arrayidx10.1, align 1
- %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2
- store i8 %1, ptr %arrayidx14.1, align 1
- %add.2 = add i8 %0, %1
- %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3
- store i8 %add.2, ptr %arrayidx14.2, align 1
- %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1
- %2 = load i8, ptr %arrayidx10.3, align 1
- %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4
- store i8 %2, ptr %arrayidx14.3, align 1
- %add.4 = add i8 %0, %2
- %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5
- store i8 %add.4, ptr %arrayidx14.4, align 1
- %add.5 = add i8 %1, %2
- %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6
- store i8 %add.5, ptr %arrayidx14.5, align 1
- %add.6 = add i8 %0, %add.5
- %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7
- store i8 %add.6, ptr %arrayidx14.6, align 1
- %3 = load i8, ptr %in.addr.031, align 1
- %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8
- store i8 %3, ptr %arrayidx14.7, align 1
- %add.8 = add i8 %0, %3
- %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9
- store i8 %add.8, ptr %arrayidx14.8, align 1
- %add.9 = add i8 %1, %3
- %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10
- store i8 %add.9, ptr %arrayidx14.9, align 1
- %add.10 = add i8 %0, %add.9
- %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11
- store i8 %add.10, ptr %arrayidx14.10, align 1
- %add.11 = add i8 %2, %3
- %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12
- store i8 %add.11, ptr %arrayidx14.11, align 1
- %add.12 = add i8 %0, %add.11
- %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13
- store i8 %add.12, ptr %arrayidx14.12, align 1
- %add.13 = add i8 %1, %add.11
- %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14
- store i8 %add.13, ptr %arrayidx14.13, align 1
- %add.14 = add i8 %0, %add.13
- %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15
- store i8 %add.14, ptr %arrayidx14.14, align 1
- %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16
- %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4
- %inc17 = add nuw nsw i32 %i.033, 1
- %exitcond.not = icmp eq i32 %inc17, 32
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
-}
-
-!0 = distinct !{!0, !1}
-!1 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll
new file mode 100644
index 0000000..fe7f43f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll
@@ -0,0 +1,187 @@
+; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @wombat(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i8 %arg6) #0 {
+; CHECK-LABEL: define void @wombat(
+; CHECK-SAME: i32 [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]], ptr [[ARG3:%.*]], ptr [[ARG4:%.*]], ptr [[ARG5:%.*]], i8 [[ARG6:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[ARG]], 0
+; CHECK-NEXT: br i1 [[ICMP]], label %[[BB7:.*]], label %[[BB25:.*]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: [[ZEXT:%.*]] = zext nneg i32 [[ARG]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[ZEXT]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[ZEXT]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[ARG5]], i64 [[ZEXT]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[ZEXT]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[ZEXT]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
+; CHECK-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP3]]
+; CHECK-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP4]]
+; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP1]]
+; CHECK-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; CHECK-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
+; CHECK-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP3]]
+; CHECK-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP1]]
+; CHECK-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
+; CHECK-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; CHECK-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP4]]
+; CHECK-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP1]]
+; CHECK-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
+; CHECK-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX27]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[ARG6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META3:![0-9]+]], !noalias [[META5:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD28:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META9:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP8]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META10:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD28]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP10]], ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META3]], !noalias [[META5]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META11:![0-9]+]], !noalias [[META12:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD28]], [[WIDE_MASKED_LOAD28]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD30]], [[TMP12]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META11]], !noalias [[META12]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB24:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[BB7]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[BB8:.*]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD22:%.*]], %[[BB21:.*]] ]
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GETELEMENTPTR]], align 1
+; CHECK-NEXT: [[ICMP9:%.*]] = icmp ult i8 [[LOAD]], [[ARG6]]
+; CHECK-NEXT: br i1 [[ICMP9]], label %[[BB21]], label %[[BB10:.*]]
+; CHECK: [[BB10]]:
+; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD12:%.*]] = load i8, ptr [[GETELEMENTPTR11]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG3]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD14:%.*]] = load i8, ptr [[GETELEMENTPTR13]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG4]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD16:%.*]] = load i8, ptr [[GETELEMENTPTR15]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LOAD16]], [[LOAD14]]
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[LOAD12]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[GETELEMENTPTR11]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR17:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG2]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD18:%.*]] = load i8, ptr [[GETELEMENTPTR17]], align 1
+; CHECK-NEXT: [[MUL19:%.*]] = mul i8 [[LOAD14]], [[LOAD14]]
+; CHECK-NEXT: [[ADD20:%.*]] = add i8 [[LOAD18]], [[MUL19]]
+; CHECK-NEXT: store i8 [[ADD20]], ptr [[GETELEMENTPTR17]], align 1
+; CHECK-NEXT: br label %[[BB21]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: [[ADD22]] = add nuw nsw i64 [[PHI]], 1
+; CHECK-NEXT: [[ICMP23:%.*]] = icmp eq i64 [[ADD22]], [[ZEXT]]
+; CHECK-NEXT: br i1 [[ICMP23]], label %[[BB24]], label %[[BB8]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: [[BB24]]:
+; CHECK-NEXT: br label %[[BB25]]
+; CHECK: [[BB25]]:
+; CHECK-NEXT: ret void
+;
+bb:
+ %icmp = icmp sgt i32 %arg, 0
+ br i1 %icmp, label %bb7, label %bb25
+
+bb7: ; preds = %bb
+ %zext = zext nneg i32 %arg to i64
+ br label %bb8
+
+bb8: ; preds = %bb21, %bb7
+ %phi = phi i64 [ 0, %bb7 ], [ %add22, %bb21 ]
+ %getelementptr = getelementptr inbounds nuw i8, ptr %arg5, i64 %phi
+ %load = load i8, ptr %getelementptr, align 1
+ %icmp9 = icmp ult i8 %load, %arg6
+ br i1 %icmp9, label %bb21, label %bb10
+
+bb10: ; preds = %bb8
+ %getelementptr11 = getelementptr inbounds nuw i8, ptr %arg1, i64 %phi
+ %load12 = load i8, ptr %getelementptr11, align 1
+ %getelementptr13 = getelementptr inbounds nuw i8, ptr %arg3, i64 %phi
+ %load14 = load i8, ptr %getelementptr13, align 1
+ %getelementptr15 = getelementptr inbounds nuw i8, ptr %arg4, i64 %phi
+ %load16 = load i8, ptr %getelementptr15, align 1
+ %mul = mul i8 %load16, %load14
+ %add = add i8 %mul, %load12
+ store i8 %add, ptr %getelementptr11, align 1
+ %getelementptr17 = getelementptr inbounds nuw i8, ptr %arg2, i64 %phi
+ %load18 = load i8, ptr %getelementptr17, align 1
+ %mul19 = mul i8 %load14, %load14
+ %add20 = add i8 %load18, %mul19
+ store i8 %add20, ptr %getelementptr17, align 1
+ br label %bb21
+
+bb21: ; preds = %bb10, %bb8
+ %add22 = add nuw nsw i64 %phi, 1
+ %icmp23 = icmp eq i64 %add22, %zext
+ br i1 %icmp23, label %bb24, label %bb8, !llvm.loop !0
+
+bb24: ; preds = %bb21
+ br label %bb25
+
+bb25: ; preds = %bb24, %bb
+ ret void
+}
+
+attributes #0 = { uwtable vscale_range(1,16) "aarch64_pstate_sm_body" "target-features"="+fp-armv8,+neon,+sme,+v8a,-fmv" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 16}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
+; CHECK: [[META5]] = !{[[META6:![0-9]+]], [[META1]], [[META7:![0-9]+]], [[META8:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]], [[META2]]}
+; CHECK: [[META7]] = distinct !{[[META7]], [[META2]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META2]]}
+; CHECK: [[META9]] = !{[[META7]]}
+; CHECK: [[META10]] = !{[[META8]]}
+; CHECK: [[META11]] = !{[[META6]]}
+; CHECK: [[META12]] = !{[[META1]], [[META7]], [[META8]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]}
+; CHECK: [[META14]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META15]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index cdddcc9..68cfc65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
-; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
target triple = "arm64-apple-macosx15.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index 54b7f2a..f2ae327 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef
; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]]
-; CHECK-NEXT: store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]]
+; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144
; CHECK-NEXT: br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[FOR_END11]]:
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll
new file mode 100644
index 0000000..2926371
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR
+
+define i64 @test1(i64 %i) nounwind readnone {
+ %loc = alloca i64
+ %j = load i64, ptr %loc
+ %r = add i64 %i, %j
+ ret i64 %r
+}
+
+define i64 @test2(i32 %i) nounwind readnone {
+ %loc = alloca i32
+ %j = load i32, ptr %loc
+ %r = add i32 %i, %j
+ %ext = zext i32 %r to i64
+ ret i64 %ext
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected
new file mode 100644
index 0000000..88cb03e
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR
+
+define i64 @test1(i64 %i) nounwind readnone {
+; ASM-LABEL: test1:
+; ASM: # %bb.0:
+; ASM-NEXT: movq %rdi, %rax
+; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax
+; ASM-NEXT: retq
+; MIR-LABEL: name: test1
+; MIR: bb.0 (%ir-block.0):
+; MIR-NEXT: liveins: $rdi
+; MIR-NEXT: {{ $}}
+; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc)
+; MIR-NEXT: $rax = COPY [[ADD64rm]]
+; MIR-NEXT: RET 0, $rax
+ %loc = alloca i64
+ %j = load i64, ptr %loc
+ %r = add i64 %i, %j
+ ret i64 %r
+}
+
+define i64 @test2(i32 %i) nounwind readnone {
+; ASM-LABEL: test2:
+; ASM: # %bb.0:
+; ASM-NEXT: movl %edi, %eax
+; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax
+; ASM-NEXT: retq
+; MIR-LABEL: name: test2
+; MIR: bb.0 (%ir-block.0):
+; MIR-NEXT: liveins: $edi
+; MIR-NEXT: {{ $}}
+; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi
+; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc)
+; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit
+; MIR-NEXT: $rax = COPY [[SUBREG_TO_REG]]
+; MIR-NEXT: RET 0, $rax
+ %loc = alloca i32
+ %j = load i32, ptr %loc
+ %r = add i32 %i, %j
+ %ext = zext i32 %r to i64
+ ret i64 %ext
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll
new file mode 100644
index 0000000..7167bcf
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK
+
+define i32 @add(i32 %a, i32 %b) {
+ %sum = add i32 %a, %b
+ ret i32 %sum
+}
+
+define i32 @sub(i32 %a, i32 %b) {
+ %diff = sub i32 %a, %b
+ ret i32 %diff
+}
+
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected
new file mode 100644
index 0000000..1ba920d
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK
+
+define i32 @add(i32 %a, i32 %b) {
+ %sum = add i32 %a, %b
+ ret i32 %sum
+}
+
+define i32 @sub(i32 %a, i32 %b) {
+ %diff = sub i32 %a, %b
+ ret i32 %diff
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test
new file mode 100644
index 0000000..6fc57b5
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test
@@ -0,0 +1,9 @@
+# REQUIRES: x86-registered-target
+## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file
+
+# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll
+# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll
+
+## Verify that running the script again on an already updated file doesn't add duplicate checks
+# RUN: %update_llc_test_checks %t.ll
+# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test
new file mode 100644
index 0000000..0f8aaa54
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test
@@ -0,0 +1,8 @@
+# REQUIRES: x86-registered-target
+## Test that using the same prefix for both ASM and MIR outputs generates a warning
+## and doesn't produce any checks.
+
+# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING
+# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll
+
+# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 59a9ea1..c1791dfa 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1132,8 +1132,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, Load->getAlign(), {},
- {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1250,8 +1249,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
- VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, Load->getAlign(),
- {}, {});
+ VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1265,8 +1263,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
- VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false,
- Store->getAlign(), {}, {});
+ VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {},
+ {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index 2dad16a..baa0377 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -605,6 +605,7 @@ TRIPLE_IR_RE = re.compile(r'^\s*target\s+triple\s*=\s*"([^"]+)"$')
TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)")
MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)")
DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)")
+STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)")
IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_")
IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+")
diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py
index 24bb8b3..01ee0e1 100644
--- a/llvm/utils/UpdateTestChecks/mir.py
+++ b/llvm/utils/UpdateTestChecks/mir.py
@@ -163,13 +163,15 @@ def add_mir_checks_for_function(
print_fixed_stack,
first_check_is_next,
at_the_function_name,
+ check_indent=None,
):
printed_prefixes = set()
for run in run_list:
for prefix in run[0]:
if prefix in printed_prefixes:
break
- if not func_dict[prefix][func_name]:
+ # func_info can be empty if there was a prefix conflict.
+ if not func_dict[prefix].get(func_name):
continue
if printed_prefixes:
# Add some space between different check prefixes.
@@ -185,6 +187,7 @@ def add_mir_checks_for_function(
func_dict[prefix][func_name],
print_fixed_stack,
first_check_is_next,
+ check_indent,
)
break
else:
@@ -204,6 +207,7 @@ def add_mir_check_lines(
func_info,
print_fixed_stack,
first_check_is_next,
+ check_indent=None,
):
func_body = str(func_info).splitlines()
if single_bb:
@@ -220,7 +224,10 @@ def add_mir_check_lines(
first_line = func_body[0]
indent = len(first_line) - len(first_line.lstrip(" "))
# A check comment, indented the appropriate amount
- check = "{:>{}}; {}".format("", indent, prefix)
+ if check_indent is not None:
+ check = "{}; {}".format(check_indent, prefix)
+ else:
+ check = "{:>{}}; {}".format("", indent, prefix)
output_lines.append("{}-LABEL: name: {}".format(check, func_name))
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 3c3fdf7..9c64db5 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -38,6 +38,7 @@ static_library("bugprone") {
"EasilySwappableParametersCheck.cpp",
"EmptyCatchCheck.cpp",
"ExceptionEscapeCheck.cpp",
+ "FloatLoopCounterCheck.cpp",
"FoldInitTypeCheck.cpp",
"ForwardDeclarationNamespaceCheck.cpp",
"ForwardingReferenceOverloadCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
index 1eae289..16f914a 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
@@ -16,7 +16,6 @@ static_library("cert") {
]
sources = [
"CERTTidyModule.cpp",
- "FloatLoopCounter.cpp",
"LimitedRandomnessCheck.cpp",
"MutatingCopyCheck.cpp",
"ProperlySeededRandomGeneratorCheck.cpp",
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index cd9512f..b1f20a7 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1,8 +1,3 @@
-Analysis/LoopAccessAnalysis/memcheck-ni.ll
-Analysis/MemorySSA/pr116227.ll
-Analysis/MemorySSA/pr43641.ll
-Analysis/MemorySSA/pr46574.ll
-Analysis/MemorySSA/update-remove-dead-blocks.ll
Bitcode/fcmp-fast.ll
Bitcode/flags.ll
CodeGen/AArch64/cgdata-merge-local.ll
@@ -26,27 +21,12 @@ CodeGen/X86/nocfivalue.ll
DebugInfo/AArch64/ir-outliner.ll
DebugInfo/assignment-tracking/X86/hotcoldsplit.ll
DebugInfo/Generic/block-asan.ll
-DebugInfo/KeyInstructions/Generic/loop-unswitch.ll
DebugInfo/X86/asan_debug_info.ll
LTO/X86/diagnostic-handler-remarks-with-hotness.ll
Other/optimization-remarks-auto.ll
Other/X86/debugcounter-partiallyinlinelibcalls.ll
-Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
-Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
-Transforms/AtomicExpand/AArch64/pcsections.ll
Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll
-Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll
-Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
-Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll
-Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll
-Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
-Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
-Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
-Transforms/AtomicExpand/SPARC/libcalls.ll
Transforms/AtomicExpand/SPARC/partword.ll
-Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
-Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll
-Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
Transforms/Attributor/align.ll
Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
Transforms/Attributor/ArgumentPromotion/2008-07-02-array-indexing.ll
@@ -265,14 +245,13 @@ Transforms/InstCombine/and2.ll
Transforms/InstCombine/and-fcmp.ll
Transforms/InstCombine/and.ll
Transforms/InstCombine/and-or-icmps.ll
-Transforms/InstCombine/and-or-implied-cond-not.ll
Transforms/InstCombine/apint-div1.ll
Transforms/InstCombine/apint-div2.ll
Transforms/InstCombine/ashr-demand.ll
Transforms/InstCombine/atomic.ll
Transforms/InstCombine/binop-cast.ll
-Transforms/InstCombine/binop-select.ll
Transforms/InstCombine/binop-select-cast-of-select-cond.ll
+Transforms/InstCombine/binop-select.ll
Transforms/InstCombine/bit-checks.ll
Transforms/InstCombine/bitreverse.ll
Transforms/InstCombine/branch.ll
@@ -298,7 +277,6 @@ Transforms/InstCombine/fold-ctpop-of-not.ll
Transforms/InstCombine/fold-ext-eq-c-with-op.ll
Transforms/InstCombine/free-inversion.ll
Transforms/InstCombine/icmp-and-lowbit-mask.ll
-Transforms/InstCombine/icmp-equality-test.ll
Transforms/InstCombine/icmp.ll
Transforms/InstCombine/icmp-mul-and.ll
Transforms/InstCombine/icmp-of-and-x.ll
@@ -307,7 +285,6 @@ Transforms/InstCombine/icmp-select-implies-common-op.ll
Transforms/InstCombine/icmp-select.ll
Transforms/InstCombine/icmp-with-selects.ll
Transforms/InstCombine/intrinsic-select.ll
-Transforms/InstCombine/known-never-nan.ll
Transforms/InstCombine/ldexp-ext.ll
Transforms/InstCombine/ldexp.ll
Transforms/InstCombine/load-bitcast-select.ll
@@ -347,13 +324,11 @@ Transforms/InstCombine/or.ll
Transforms/InstCombine/pow-1.ll
Transforms/InstCombine/pow-3.ll
Transforms/InstCombine/pow-sqrt.ll
-Transforms/InstCombine/pr24354.ll
Transforms/InstCombine/pull-conditional-binop-through-shift.ll
Transforms/InstCombine/rem.ll
Transforms/InstCombine/sdiv-canonicalize.ll
Transforms/InstCombine/sdiv-guard.ll
Transforms/InstCombine/select-and-or.ll
-Transforms/InstCombine/select-bitext.ll
Transforms/InstCombine/select-cmp-br.ll
Transforms/InstCombine/select-cmp.ll
Transforms/InstCombine/select-factorize.ll
@@ -362,7 +337,6 @@ Transforms/InstCombine/select.ll
Transforms/InstCombine/select-min-max.ll
Transforms/InstCombine/select-of-symmetric-selects.ll
Transforms/InstCombine/select-select.ll
-Transforms/InstCombine/select-with-extreme-eq-cond.ll
Transforms/InstCombine/shift.ll
Transforms/InstCombine/shuffle-select-narrow-inseltpoison.ll
Transforms/InstCombine/shuffle-select-narrow.ll
@@ -512,66 +486,12 @@ Transforms/LoopBoundSplit/bug51866.ll
Transforms/LoopBoundSplit/bug-loop-bound-split-phi-in-exit-block.ll
Transforms/LoopBoundSplit/loop-bound-split.ll
Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll
-Transforms/LoopDistribute/basic-with-memchecks.ll
-Transforms/LoopDistribute/bounds-expansion-bug.ll
-Transforms/LoopDistribute/cross-partition-access.ll
-Transforms/LoopDistribute/debug-loc.ll
-Transforms/LoopDistribute/debug-print.ll
-Transforms/LoopDistribute/diagnostics.ll
-Transforms/LoopDistribute/followup.ll
-Transforms/LoopDistribute/laa-invalidation.ll
-Transforms/LoopDistribute/outside-use.ll
-Transforms/LoopDistribute/pointer-phi-in-loop.ll
-Transforms/LoopDistribute/scev-inserted-runtime-check.ll
-Transforms/LoopDistribute/symbolic-stride.ll
-Transforms/LoopFlatten/loop-flatten-version.ll
Transforms/LoopIdiom/AArch64/byte-compare-index.ll
Transforms/LoopIdiom/AArch64/find-first-byte.ll
Transforms/LoopIdiom/RISCV/byte-compare-index.ll
-Transforms/LoopIdiom/X86/arithmetic-right-shift-until-zero.ll
-Transforms/LoopIdiom/X86/left-shift-until-bittest.ll
-Transforms/LoopIdiom/X86/left-shift-until-zero.ll
-Transforms/LoopIdiom/X86/logical-right-shift-until-zero-debuginfo.ll
-Transforms/LoopIdiom/X86/logical-right-shift-until-zero.ll
-Transforms/LoopLoadElim/forward.ll
-Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll
-Transforms/LoopLoadElim/memcheck.ll
-Transforms/LoopLoadElim/pr47457.ll
-Transforms/LoopLoadElim/symbolic-stride.ll
-Transforms/LoopLoadElim/unknown-stride-known-dep.ll
-Transforms/LoopLoadElim/versioning-scev-invalidation.ll
-Transforms/LoopPredication/preserve-bpi.ll
-Transforms/LoopSimplifyCFG/constant-fold-branch.ll
-Transforms/LoopSimplifyCFG/handle_dead_exits.ll
-Transforms/LoopSimplifyCFG/invalidate-scev-dispositions-2.ll
-Transforms/LoopSimplifyCFG/invalidate-scev-dispositions.ll
-Transforms/LoopSimplifyCFG/lcssa.ll
-Transforms/LoopSimplifyCFG/live_block_marking.ll
-Transforms/LoopSimplifyCFG/mssa_update.ll
-Transforms/LoopSimplifyCFG/pr117537.ll
-Transforms/LoopSimplifyCFG/update_parents.ll
Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll
Transforms/LoopUnroll/peel-last-iteration-with-guards.ll
Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
-Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
-Transforms/LoopVersioning/add-phi-update-users.ll
-Transforms/LoopVersioning/basic.ll
-Transforms/LoopVersioning/bound-check-partially-known.ll
-Transforms/LoopVersioning/crash-36998.ll
-Transforms/LoopVersioning/exit-block-dominates-rt-check-block.ll
-Transforms/LoopVersioning/incorrect-phi.ll
-Transforms/LoopVersioning/invalidate-laa-after-versioning.ll
-Transforms/LoopVersioning/lcssa.ll
-Transforms/LoopVersioningLICM/load-from-unknown-address.ll
-Transforms/LoopVersioningLICM/loopversioningLICM1.ll
-Transforms/LoopVersioningLICM/loopversioningLICM2.ll
-Transforms/LoopVersioningLICM/metadata.ll
-Transforms/LoopVersioning/loop-invariant-bound.ll
-Transforms/LoopVersioning/noalias.ll
-Transforms/LoopVersioning/noalias-version-twice.ll
-Transforms/LoopVersioning/single-iteration.ll
-Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll
-Transforms/LoopVersioning/wrapping-pointer-versioning.ll
Transforms/LowerAtomic/atomic-load.ll
Transforms/LowerAtomic/atomic-swap.ll
Transforms/LowerConstantIntrinsics/builtin-object-size-phi.ll
@@ -740,27 +660,6 @@ Transforms/Scalarizer/scatter-order.ll
Transforms/Scalarizer/variable-extractelement.ll
Transforms/Scalarizer/variable-insertelement.ll
Transforms/Scalarizer/vector-of-pointer-to-vector.ll
-Transforms/SimpleLoopUnswitch/debuginfo.ll
-Transforms/SimpleLoopUnswitch/delete-dead-blocks.ll
-Transforms/SimpleLoopUnswitch/endless-unswitch.ll
-Transforms/SimpleLoopUnswitch/guards.ll
-Transforms/SimpleLoopUnswitch/inject-invariant-conditions-exponential.ll
-Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
-Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch-skip-selects-in-guards.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch-mssa-threshold.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch-update-memoryssa.ll
-Transforms/SimpleLoopUnswitch/pr138509.ll
-Transforms/SimpleLoopUnswitch/pr59546.ll
-Transforms/SimpleLoopUnswitch/pr60736.ll
-Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
-Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
-Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
Transforms/StructurizeCFG/callbr.ll
Transforms/StructurizeCFG/hoist-zerocost.ll
diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py
index 8c57e75..98864be 100755
--- a/llvm/utils/update_llc_test_checks.py
+++ b/llvm/utils/update_llc_test_checks.py
@@ -15,7 +15,7 @@ import argparse
import os # Used to advertise this file's name ("autogenerated_note").
import sys
-from UpdateTestChecks import common
+from UpdateTestChecks import common, mir
# llc is the only llc-like in the LLVM tree but downstream forks can add
# additional ones here if they have them.
@@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo):
break
run_list = []
+ mir_run_list = []
for l in ti.run_lines:
if "|" not in l:
common.warn("Skipping unparsable RUN line: " + l)
@@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo):
if m:
march_in_cmd = m.groups()[0]
+ target_list = run_list
m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd)
if m and m.groups()[0] == "isel":
from UpdateTestChecks import isel as output_type
+ elif not m and common.STOP_PASS_RE.search(llc_cmd):
+ # MIR output mode. If -debug-only is present assume
+ # the debug output is the main point of interest.
+ target_list = mir_run_list
else:
from UpdateTestChecks import asm as output_type
@@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo):
# FIXME: We should use multiple check prefixes to common check lines. For
# now, we just ignore all but the last.
- run_list.append(
+ target_list.append(
(
check_prefixes,
llc_tool,
@@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo):
ginfo=ginfo,
)
- for (
- prefixes,
- llc_tool,
- llc_args,
- preprocess_cmd,
- triple_in_cmd,
- march_in_cmd,
- ) in run_list:
+ # Dictionary to store MIR function bodies separately
+ mir_func_dict = {}
+ for run_tuple, is_mir in [(run, False) for run in run_list] + [
+ (run, True) for run in mir_run_list
+ ]:
+ (
+ prefixes,
+ llc_tool,
+ llc_args,
+ preprocess_cmd,
+ triple_in_cmd,
+ march_in_cmd,
+ ) = run_tuple
+
common.debug("Extracted LLC cmd:", llc_tool, llc_args)
common.debug("Extracted FileCheck prefixes:", str(prefixes))
@@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo):
if not triple:
triple = common.get_triple_from_march(march_in_cmd)
- scrubber, function_re = output_type.get_run_handler(triple)
- if 0 == builder.process_run_line(
- function_re, scrubber, raw_tool_output, prefixes
- ):
- common.warn(
- "Couldn't match any function. Possibly the wrong target triple has been provided"
+ if is_mir:
+ # MIR output mode
+ common.debug("Detected MIR output mode for prefixes:", str(prefixes))
+ for prefix in prefixes:
+ if prefix not in mir_func_dict:
+ mir_func_dict[prefix] = {}
+
+ mir.build_function_info_dictionary(
+ ti.path,
+ raw_tool_output,
+ triple,
+ prefixes,
+ mir_func_dict,
+ ti.args.verbose,
)
- builder.processed_prefixes(prefixes)
+ else:
+ # ASM output mode
+ scrubber, function_re = output_type.get_run_handler(triple)
+ if 0 == builder.process_run_line(
+ function_re, scrubber, raw_tool_output, prefixes
+ ):
+ common.warn(
+ "Couldn't match any function. Possibly the wrong target triple has been provided"
+ )
+ builder.processed_prefixes(prefixes)
func_dict = builder.finish_and_get_func_dict()
+
+ # Check for conflicts: same prefix used for both ASM and MIR
+ conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys())
+ if conflicting_prefixes:
+ common.warn(
+ "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format(
+ ", ".join(sorted(conflicting_prefixes))
+ ),
+ test_file=ti.path,
+ )
+ for prefix in conflicting_prefixes:
+ mir_func_dict[prefix] = {}
+ func_dict[prefix] = {}
+
global_vars_seen_dict = {}
is_in_function = False
is_in_function_start = False
func_name = None
prefix_set = set([prefix for p in run_list for prefix in p[0]])
+ prefix_set.update([prefix for p in mir_run_list for prefix in p[0]])
common.debug("Rewriting FileCheck prefixes:", str(prefix_set))
output_lines = []
@@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo):
is_filtered=builder.is_filtered(),
)
)
+
+ # Also add MIR checks if we have them for this function
+ if mir_run_list and func_name:
+ mir.add_mir_checks_for_function(
+ ti.path,
+ output_lines,
+ mir_run_list,
+ mir_func_dict,
+ func_name,
+ single_bb=False, # Don't skip basic block labels.
+ print_fixed_stack=False, # Don't print fixed stack (ASM tests don't need it).
+ first_check_is_next=False, # First check is LABEL, not NEXT.
+ at_the_function_name=False, # Use "name:" not "@name".
+ check_indent="", # No indentation for IR files (not MIR files).
+ )
+
is_in_function_start = False
if is_in_function:
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index a920d57..8d20b49 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -835,6 +835,12 @@ each pass, the generator produces a `registerPassName` where
generates a `registerGroupPasses`, where `Group` is the tag provided via the
`-name` input parameter, that registers all of the passes present.
+These declarations can be enabled for the whole group of passes by
+defining the `GEN_PASS_REGISTRATION` macro, or on a per-pass basis by
+defining `GEN_PASS_REGISTRATION_PASSNAME` where `PASSNAME` is the
+uppercase version of the name of the pass (similar to pass def and
+decls).
+
```c++
// Tablegen options: -gen-pass-decls -name="Example"
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index 0e42d08..b628f1a 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -395,7 +395,7 @@ def SPV_INTEL_fpga_buffer_location : I32EnumAttrCase<"SPV_INTEL_fp
def SPV_INTEL_arbitrary_precision_fixed_point : I32EnumAttrCase<"SPV_INTEL_arbitrary_precision_fixed_point", 4019>;
def SPV_INTEL_usm_storage_classes : I32EnumAttrCase<"SPV_INTEL_usm_storage_classes", 4020>;
def SPV_INTEL_io_pipes : I32EnumAttrCase<"SPV_INTEL_io_pipes", 4021>;
-def SPV_INTEL_blocking_pipes : I32EnumAttrCase<"SPV_INTEL_blocking_pipes", 4022>;
+def SPV_ALTERA_blocking_pipes : I32EnumAttrCase<"SPV_ALTERA_blocking_pipes", 4022>;
def SPV_INTEL_fpga_reg : I32EnumAttrCase<"SPV_INTEL_fpga_reg", 4023>;
def SPV_INTEL_long_constant_composite : I32EnumAttrCase<"SPV_INTEL_long_constant_composite", 4024>;
def SPV_INTEL_optnone : I32EnumAttrCase<"SPV_INTEL_optnone", 4025>;
@@ -465,7 +465,7 @@ def SPIRV_ExtensionAttr :
SPV_INTEL_kernel_attributes, SPV_INTEL_fpga_memory_accesses,
SPV_INTEL_fpga_cluster_attributes, SPV_INTEL_loop_fuse,
SPV_INTEL_fpga_buffer_location, SPV_INTEL_arbitrary_precision_fixed_point,
- SPV_INTEL_usm_storage_classes, SPV_INTEL_io_pipes, SPV_INTEL_blocking_pipes,
+ SPV_INTEL_usm_storage_classes, SPV_INTEL_io_pipes, SPV_ALTERA_blocking_pipes,
SPV_INTEL_fpga_reg, SPV_INTEL_long_constant_composite, SPV_INTEL_optnone,
SPV_INTEL_debug_module, SPV_INTEL_fp_fast_math_mode,
SPV_INTEL_memory_access_aliasing, SPV_INTEL_split_barrier,
@@ -807,9 +807,9 @@ def SPIRV_C_IOPipesINTEL : I32EnumAttrCase<"IOPip
Extension<[SPV_INTEL_io_pipes]>
];
}
-def SPIRV_C_BlockingPipesINTEL : I32EnumAttrCase<"BlockingPipesINTEL", 5945> {
+def SPIRV_C_BlockingPipesALTERA : I32EnumAttrCase<"BlockingPipesALTERA", 5945> {
list<Availability> availability = [
- Extension<[SPV_INTEL_blocking_pipes]>
+ Extension<[SPV_ALTERA_blocking_pipes]>
];
}
def SPIRV_C_FPGARegINTEL : I32EnumAttrCase<"FPGARegINTEL", 5948> {
@@ -1519,7 +1519,7 @@ def SPIRV_CapabilityAttr :
SPIRV_C_FPGAMemoryAccessesINTEL, SPIRV_C_FPGAClusterAttributesINTEL,
SPIRV_C_LoopFuseINTEL, SPIRV_C_MemoryAccessAliasingINTEL,
SPIRV_C_FPGABufferLocationINTEL, SPIRV_C_ArbitraryPrecisionFixedPointINTEL,
- SPIRV_C_USMStorageClassesINTEL, SPIRV_C_IOPipesINTEL, SPIRV_C_BlockingPipesINTEL,
+ SPIRV_C_USMStorageClassesINTEL, SPIRV_C_IOPipesINTEL, SPIRV_C_BlockingPipesALTERA,
SPIRV_C_FPGARegINTEL, SPIRV_C_DotProductInputAll,
SPIRV_C_DotProductInput4x8BitPacked, SPIRV_C_DotProduct, SPIRV_C_RayCullMaskKHR,
SPIRV_C_CooperativeMatrixKHR, SPIRV_C_ReplicatedCompositesEXT,
diff --git a/mlir/include/mlir/Support/Timing.h b/mlir/include/mlir/Support/Timing.h
index 3d61a0a..50ae847 100644
--- a/mlir/include/mlir/Support/Timing.h
+++ b/mlir/include/mlir/Support/Timing.h
@@ -473,6 +473,11 @@ void registerDefaultTimingManagerCLOptions();
/// 'registerDefaultTimingManagerOptions' to a `DefaultTimingManager`.
void applyDefaultTimingManagerCLOptions(DefaultTimingManager &tm);
+/// Create an output strategy for the specified format, to be passed to
+/// DefaultTimingManager::setOutput().
+std::unique_ptr<OutputStrategy>
+createOutputStrategy(DefaultTimingManager::OutputFormat fmt, raw_ostream &os);
+
} // namespace mlir
#endif // MLIR_SUPPORT_TIMING_H
diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
index 1e56810..7420412 100644
--- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
+++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
@@ -328,7 +328,6 @@ static bool traverseRegionGraph(Region *begin,
<< nextRegion->getRegionNumber() << ", returning true";
return true;
}
- llvm::dbgs() << "Region: " << nextRegion << "\n";
if (!nextRegion->getParentOp()) {
llvm::errs() << "Region " << *nextRegion << " has no parent op\n";
return false;
diff --git a/mlir/lib/Support/Timing.cpp b/mlir/lib/Support/Timing.cpp
index 2e92d9c..b0ac379 100644
--- a/mlir/lib/Support/Timing.cpp
+++ b/mlir/lib/Support/Timing.cpp
@@ -619,11 +619,17 @@ void mlir::applyDefaultTimingManagerCLOptions(DefaultTimingManager &tm) {
return;
tm.setEnabled(options->timing);
tm.setDisplayMode(options->displayMode);
+ tm.setOutput(createOutputStrategy(options->outputFormat, llvm::errs()));
+}
- std::unique_ptr<OutputStrategy> printer;
- if (options->outputFormat == OutputFormat::Text)
- printer = std::make_unique<OutputTextStrategy>(llvm::errs());
- else if (options->outputFormat == OutputFormat::Json)
- printer = std::make_unique<OutputJsonStrategy>(llvm::errs());
- tm.setOutput(std::move(printer));
+std::unique_ptr<OutputStrategy>
+mlir::createOutputStrategy(DefaultTimingManager::OutputFormat fmt,
+ raw_ostream &os) {
+ switch (fmt) {
+ case OutputFormat::Text:
+ return std::make_unique<OutputTextStrategy>(os);
+ case OutputFormat::Json:
+ return std::make_unique<OutputJsonStrategy>(os);
+ }
+ llvm_unreachable("Invalid output format");
}
diff --git a/mlir/tools/mlir-tblgen/PassGen.cpp b/mlir/tools/mlir-tblgen/PassGen.cpp
index f7134ce..f4b8eb4 100644
--- a/mlir/tools/mlir-tblgen/PassGen.cpp
+++ b/mlir/tools/mlir-tblgen/PassGen.cpp
@@ -57,19 +57,23 @@ const char *const passRegistrationCode = R"(
//===----------------------------------------------------------------------===//
// {0} Registration
//===----------------------------------------------------------------------===//
+#ifdef {1}
inline void register{0}() {{
::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {{
- return {1};
+ return {2};
});
}
// Old registration code, kept for temporary backwards compatibility.
inline void register{0}Pass() {{
::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {{
- return {1};
+ return {2};
});
}
+
+#undef {1}
+#endif // {1}
)";
/// The code snippet used to generate a function to register all passes in a
@@ -116,6 +120,10 @@ static std::string getPassDeclVarName(const Pass &pass) {
return "GEN_PASS_DECL_" + pass.getDef()->getName().upper();
}
+static std::string getPassRegistrationVarName(const Pass &pass) {
+ return "GEN_PASS_REGISTRATION_" + pass.getDef()->getName().upper();
+}
+
/// Emit the code to be included in the public header of the pass.
static void emitPassDecls(const Pass &pass, raw_ostream &os) {
StringRef passName = pass.getDef()->getName();
@@ -143,18 +151,25 @@ static void emitPassDecls(const Pass &pass, raw_ostream &os) {
/// PassRegistry.
static void emitRegistrations(llvm::ArrayRef<Pass> passes, raw_ostream &os) {
os << "#ifdef GEN_PASS_REGISTRATION\n";
+ os << "// Generate registrations for all passes.\n";
+ for (const Pass &pass : passes)
+ os << "#define " << getPassRegistrationVarName(pass) << "\n";
+ os << "#endif // GEN_PASS_REGISTRATION\n";
for (const Pass &pass : passes) {
+ std::string passName = pass.getDef()->getName().str();
+ std::string passEnableVarName = getPassRegistrationVarName(pass);
+
std::string constructorCall;
if (StringRef constructor = pass.getConstructor(); !constructor.empty())
constructorCall = constructor.str();
else
- constructorCall = formatv("create{0}()", pass.getDef()->getName()).str();
-
- os << formatv(passRegistrationCode, pass.getDef()->getName(),
+ constructorCall = formatv("create{0}()", passName).str();
+ os << formatv(passRegistrationCode, passName, passEnableVarName,
constructorCall);
}
+ os << "#ifdef GEN_PASS_REGISTRATION\n";
os << formatv(passGroupRegistrationCode, groupName);
for (const Pass &pass : passes)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index b65fe64..ecd11b9 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -5281,6 +5281,7 @@ libc_function(
hdrs = ["src/stdlib/strfromf.h"],
deps = [
":__support_common",
+ ":printf_error_mapper",
":str_from_util",
],
)
@@ -5291,6 +5292,7 @@ libc_function(
hdrs = ["src/stdlib/strfromd.h"],
deps = [
":__support_common",
+ ":printf_error_mapper",
":str_from_util",
],
)
@@ -5301,6 +5303,7 @@ libc_function(
hdrs = ["src/stdlib/strfroml.h"],
deps = [
":__support_common",
+ ":printf_error_mapper",
":str_from_util",
],
)
@@ -6514,12 +6517,34 @@ libc_support_library(
)
libc_support_library(
+ name = "printf_error_mapper",
+ hdrs = [
+ "src/stdio/printf_core/error_mapper.h",
+ ] + select({
+ "@platforms//os:linux": [
+ "src/stdio/printf_core/linux/error_mapper.h",
+ ],
+ "//conditions:default": [
+ "src/stdio/printf_core/generic/error_mapper.h",
+ ],
+ }),
+ deps = [
+ ":__support_cpp_type_traits",
+ ":__support_error_or",
+ ":__support_macros_properties_architectures",
+ ":hdr_errno_macros",
+ ":printf_core_structs",
+ ],
+)
+
+libc_support_library(
name = "printf_main",
hdrs = ["src/stdio/printf_core/printf_main.h"],
deps = [
":__support_arg_list",
":printf_converter",
":printf_core_structs",
+ ":printf_error_mapper",
":printf_parser",
":printf_writer",
],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
index cbc6d13..e33199c 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
@@ -87,6 +87,8 @@ libc_test(
name = "fprintf_test",
srcs = ["fprintf_test.cpp"],
deps = [
+ "//libc:__support_cpp_limits",
+ "//libc:__support_macros_properties_architectures",
"//libc:fprintf",
],
)