author    Aiden Grossman <aidengrossman@google.com>  2025-06-09 22:16:17 -0700
committer Aiden Grossman <agrossman154@yahoo.com>    2025-06-09 22:16:17 -0700
commit    80e157abcc3ad4288c78d7f65d3308d46a6ce925 (patch)
tree      010ffc72e73a1434300f41fbf53225a4eb454641
parent    26fdf090280a2e055ed2da6714671d042bb15be0 (diff)
parent    591678bebd09f5d9c5781dd7a9cbd19cb8cd532c (diff)
[spr] changes introduced through rebase (users/boomanaiden154/main.ci-migrate-to-runtimes-build)
Created using spr 1.3.6 [skip ci]
Diffstat: 614 files changed, 20816 insertions(+), 9616 deletions(-), across bolt, clang-tools-extra, clang, compiler-rt, flang, libcxx, lld, lldb, llvm, mlir, offload, and utils/bazel.
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 30d9f6b..01569c6 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -100,3 +100,9 @@ d33bf2e9df578ff7e44fd22504d6ad5a122b7ee6
# [lldb][NFC] clang-format MainLoopPosix.cpp
66bdbfbaa08fa3d8e64a7fe136a8fb717f5cdbb7
+
+# [clang-tidy][NFC] Run clang-format on "clang-tools-extra/clang-tidy"
+65d66625b3e2b8322ed99d82edabecbafcd0885b
+ce46adb8b7ce645353eccaedf31ed9765dab77bb
+68070f908bb7ac5f0b5fa9722caa504ecf723f6b
+5213c57cb1f0d78aad9a253b7f6a2b62ff4c7859
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index cb8e81b..3f07a6d 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -78,6 +78,13 @@ public:
static bool checkPerfDataMagic(StringRef FileName);
private:
+ struct LBREntry {
+ uint64_t From;
+ uint64_t To;
+ bool Mispred;
+ };
+ friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);
+
struct PerfBranchSample {
SmallVector<LBREntry, 32> LBR;
};
@@ -476,7 +483,6 @@ private:
/// Debugging dump methods
void dump() const;
- void dump(const LBREntry &LBR) const;
void dump(const PerfBranchSample &Sample) const;
void dump(const PerfMemSample &Sample) const;
@@ -504,6 +510,12 @@ public:
friend class YAMLProfileWriter;
};
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+ const DataAggregator::LBREntry &L) {
+ OS << formatv("{0:x} -> {1:x}/{2}", L.From, L.To, L.Mispred ? 'M' : 'P');
+ return OS;
+}
} // namespace bolt
} // namespace llvm
diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
index 5df1b5a8..3c770fe 100644
--- a/bolt/include/bolt/Profile/DataReader.h
+++ b/bolt/include/bolt/Profile/DataReader.h
@@ -32,18 +32,6 @@ namespace bolt {
class BinaryFunction;
-struct LBREntry {
- uint64_t From;
- uint64_t To;
- bool Mispred;
-};
-
-inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) {
- OS << "0x" << Twine::utohexstr(LBR.From) << " -> 0x"
- << Twine::utohexstr(LBR.To);
- return OS;
-}
-
struct Location {
bool IsSymbol;
StringRef Name;
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index f05c40e..4022212 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -580,8 +580,10 @@ void DataAggregator::processProfile(BinaryContext &BC) {
}
}
- for (auto &FuncBranches : NamesToBranches)
+ for (auto &FuncBranches : NamesToBranches) {
llvm::stable_sort(FuncBranches.second.Data);
+ llvm::stable_sort(FuncBranches.second.EntryData);
+ }
for (auto &MemEvents : NamesToMemEvents)
llvm::stable_sort(MemEvents.second.Data);
@@ -733,8 +735,10 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
// corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
- if (!Func)
+ if (!Func) {
+ Addr = 0;
return std::pair{Func, false};
+ }
Addr -= Func->getAddress();
@@ -972,7 +976,7 @@ bool DataAggregator::recordExit(BinaryFunction &BF, uint64_t From, bool Mispred,
return true;
}
-ErrorOr<LBREntry> DataAggregator::parseLBREntry() {
+ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
LBREntry Res;
ErrorOr<StringRef> FromStrRes = parseString('/');
if (std::error_code EC = FromStrRes.getError())
@@ -1430,54 +1434,16 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
const uint64_t TraceTo = NextLBR->From;
const BinaryFunction *TraceBF =
getBinaryFunctionContainingAddress(TraceFrom);
- if (opts::HeatmapMode == opts::HeatmapModeKind::HM_Exclusive) {
- FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
+ FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
+ if (TraceBF && TraceBF->containsAddress(LBR.From))
++Info.InternCount;
- } else if (TraceBF && TraceBF->containsAddress(TraceTo)) {
- FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
- if (TraceBF->containsAddress(LBR.From))
- ++Info.InternCount;
- else
- ++Info.ExternCount;
- } else {
- const BinaryFunction *ToFunc =
- getBinaryFunctionContainingAddress(TraceTo);
- if (TraceBF && ToFunc) {
- LLVM_DEBUG({
- dbgs() << "Invalid trace starting in " << TraceBF->getPrintName()
- << formatv(" @ {0:x}", TraceFrom - TraceBF->getAddress())
- << formatv(" and ending @ {0:x}\n", TraceTo);
- });
- ++NumInvalidTraces;
- } else {
- LLVM_DEBUG({
- dbgs() << "Out of range trace starting in "
- << (TraceBF ? TraceBF->getPrintName() : "None")
- << formatv(" @ {0:x}",
- TraceFrom - (TraceBF ? TraceBF->getAddress() : 0))
- << " and ending in "
- << (ToFunc ? ToFunc->getPrintName() : "None")
- << formatv(" @ {0:x}\n",
- TraceTo - (ToFunc ? ToFunc->getAddress() : 0));
- });
- ++NumLongRangeTraces;
- }
- }
+ else
+ ++Info.ExternCount;
++NumTraces;
}
NextLBR = &LBR;
- // Record branches outside binary functions for heatmap.
- if (opts::HeatmapMode == opts::HeatmapModeKind::HM_Exclusive) {
- TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)];
- ++Info.TakenCount;
- continue;
- }
- uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0;
- uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0;
- if (!From && !To)
- continue;
- TakenBranchInfo &Info = BranchLBRs[Trace(From, To)];
+ TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)];
++Info.TakenCount;
Info.MispredCount += LBR.Mispred;
}
@@ -2398,16 +2364,10 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
void DataAggregator::dump() const { DataReader::dump(); }
-void DataAggregator::dump(const LBREntry &LBR) const {
- Diag << "From: " << Twine::utohexstr(LBR.From)
- << " To: " << Twine::utohexstr(LBR.To) << " Mispred? " << LBR.Mispred
- << "\n";
-}
-
void DataAggregator::dump(const PerfBranchSample &Sample) const {
Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n";
for (const LBREntry &LBR : Sample.LBR)
- dump(LBR);
+ Diag << LBR << '\n';
}
void DataAggregator::dump(const PerfMemSample &Sample) const {
diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test
index 92e093c..c4f5b8f 100644
--- a/bolt/test/X86/pre-aggregated-perf.test
+++ b/bolt/test/X86/pre-aggregated-perf.test
@@ -36,26 +36,26 @@ RUN: llvm-bolt %t.exe -p %p/Inputs/pre-aggregated.txt --pa -o %t.null | FileChec
CHECK: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
-RUN: cat %t | sort | FileCheck %s -check-prefix=PERF2BOLT
-RUN: cat %t.new | FileCheck %s -check-prefix=NEWFORMAT
+RUN: FileCheck %s -check-prefix=PERF2BOLT --input-file %t
+RUN: FileCheck %s -check-prefix=NEWFORMAT --input-file %t.new
## Test --profile-format option with perf2bolt
RUN: perf2bolt %t.exe -o %t.fdata --pa -p %p/Inputs/pre-aggregated.txt \
RUN: --profile-format=fdata
-RUN: cat %t.fdata | sort | FileCheck %s -check-prefix=PERF2BOLT
+RUN: FileCheck %s -check-prefix=PERF2BOLT --input-file %t.fdata
RUN: perf2bolt %t.exe -o %t.yaml --pa -p %p/Inputs/pre-aggregated.txt \
RUN: --profile-format=yaml --profile-use-dfs
-RUN: cat %t.yaml | FileCheck %s -check-prefix=NEWFORMAT
+RUN: FileCheck %s -check-prefix=NEWFORMAT --input-file %t.yaml
## Test --profile-format option with llvm-bolt --aggregate-only
RUN: llvm-bolt %t.exe -o %t.bolt.fdata --pa -p %p/Inputs/pre-aggregated.txt \
RUN: --aggregate-only --profile-format=fdata
-RUN: cat %t.bolt.fdata | sort | FileCheck %s -check-prefix=PERF2BOLT
+RUN: FileCheck %s -check-prefix=PERF2BOLT --input-file %t.bolt.fdata
RUN: llvm-bolt %t.exe -o %t.bolt.yaml --pa -p %p/Inputs/pre-aggregated.txt \
RUN: --aggregate-only --profile-format=yaml --profile-use-dfs
-RUN: cat %t.bolt.yaml | FileCheck %s -check-prefix=NEWFORMAT
+RUN: FileCheck %s -check-prefix=NEWFORMAT --input-file %t.bolt.yaml
## Test pre-aggregated basic profile
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated-basic.txt -o %t.ba \
@@ -67,16 +67,17 @@ BASIC-ERROR: BOLT-INFO: 0 out of 7 functions in the binary (0.0%) have non-empty
BASIC-SUCCESS: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
CHECK-BASIC-NL: no_lbr cycles
-PERF2BOLT: 0 [unknown] 7f36d18d60c0 1 main 53c 0 2
-PERF2BOLT: 1 main 451 1 SolveCubic 0 0 2
-PERF2BOLT: 1 main 490 0 [unknown] 4005f0 0 1
-PERF2BOLT: 1 main 537 0 [unknown] 400610 0 1
-PERF2BOLT: 1 usqrt 30 1 usqrt 32 0 22
-PERF2BOLT: 1 usqrt 30 1 usqrt 39 4 33
-PERF2BOLT: 1 usqrt 35 1 usqrt 39 0 22
-PERF2BOLT: 1 usqrt 3d 1 usqrt 10 0 58
-PERF2BOLT: 1 usqrt 3d 1 usqrt 3f 0 22
-PERF2BOLT: 1 usqrt a 1 usqrt 10 0 22
+PERF2BOLT: 1 frame_dummy/1 1e 1 frame_dummy/1 0 0 1
+PERF2BOLT-NEXT: 1 main 451 1 SolveCubic 0 0 2
+PERF2BOLT-NEXT: 1 main 490 0 [unknown] 0 0 1
+PERF2BOLT-NEXT: 1 main 537 0 [unknown] 0 0 1
+PERF2BOLT-NEXT: 0 [unknown] 0 1 main 53c 0 2
+PERF2BOLT-NEXT: 1 usqrt a 1 usqrt 10 0 22
+PERF2BOLT-NEXT: 1 usqrt 30 1 usqrt 32 0 22
+PERF2BOLT-NEXT: 1 usqrt 30 1 usqrt 39 4 33
+PERF2BOLT-NEXT: 1 usqrt 35 1 usqrt 39 0 22
+PERF2BOLT-NEXT: 1 usqrt 3d 1 usqrt 10 0 58
+PERF2BOLT-NEXT: 1 usqrt 3d 1 usqrt 3f 0 22
NEWFORMAT: - name: 'frame_dummy/1'
NEWFORMAT: fid: 3
diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp
index 38d58bc..f4ab93b 100644
--- a/clang-tools-extra/clang-tidy/ClangTidy.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp
@@ -419,8 +419,8 @@ ClangTidyASTConsumerFactory::createASTConsumer(
std::unique_ptr<ClangTidyProfiling> Profiling;
if (Context.getEnableProfiling()) {
- Profiling = std::make_unique<ClangTidyProfiling>(
- Context.getProfileStorageParams());
+ Profiling =
+ std::make_unique<ClangTidyProfiling>(Context.getProfileStorageParams());
FinderOptions.CheckProfiling.emplace(Profiling->Records);
}
@@ -432,8 +432,8 @@ ClangTidyASTConsumerFactory::createASTConsumer(
if (Context.canEnableModuleHeadersParsing() &&
Context.getLangOpts().Modules && OverlayFS != nullptr) {
- auto ModuleExpander = std::make_unique<ExpandModularHeadersPPCallbacks>(
- &Compiler, OverlayFS);
+ auto ModuleExpander =
+ std::make_unique<ExpandModularHeadersPPCallbacks>(&Compiler, OverlayFS);
ModuleExpanderPP = ModuleExpander->getPreprocessor();
PP->addPPCallbacks(std::move(ModuleExpander));
}
@@ -497,7 +497,7 @@ getCheckNames(const ClangTidyOptions &Options,
bool AllowEnablingAnalyzerAlphaCheckers) {
clang::tidy::ClangTidyContext Context(
std::make_unique<DefaultOptionsProvider>(ClangTidyGlobalOptions(),
- Options),
+ Options),
AllowEnablingAnalyzerAlphaCheckers);
ClangTidyASTConsumerFactory Factory(Context);
return Factory.getCheckNames();
@@ -508,7 +508,7 @@ getCheckOptions(const ClangTidyOptions &Options,
bool AllowEnablingAnalyzerAlphaCheckers) {
clang::tidy::ClangTidyContext Context(
std::make_unique<DefaultOptionsProvider>(ClangTidyGlobalOptions(),
- Options),
+ Options),
AllowEnablingAnalyzerAlphaCheckers);
ClangTidyDiagnosticConsumer DiagConsumer(Context);
auto DiagOpts = std::make_unique<DiagnosticOptions>();
diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h
index 037526a..399d459 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h
+++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h
@@ -530,7 +530,6 @@ void ClangTidyCheck::OptionsView::store<bool>(
ClangTidyOptions::OptionMap &Options, StringRef LocalName,
bool Value) const;
-
} // namespace tidy
} // namespace clang
diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h
index bd7a1bf..a8851e7 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h
+++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h
@@ -217,11 +217,10 @@ public:
using DiagLevelAndFormatString = std::pair<DiagnosticIDs::Level, std::string>;
DiagLevelAndFormatString getDiagLevelAndFormatString(unsigned DiagnosticID,
SourceLocation Loc) {
- return {
- static_cast<DiagnosticIDs::Level>(
- DiagEngine->getDiagnosticLevel(DiagnosticID, Loc)),
- std::string(
- DiagEngine->getDiagnosticIDs()->getDescription(DiagnosticID))};
+ return {static_cast<DiagnosticIDs::Level>(
+ DiagEngine->getDiagnosticLevel(DiagnosticID, Loc)),
+ std::string(
+ DiagEngine->getDiagnosticIDs()->getDescription(DiagnosticID))};
}
void setOptionsCollector(llvm::StringSet<> *Collector) {
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
index 1c480d1..e59f157 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
@@ -70,7 +70,8 @@ struct NOptionMap {
NOptionMap(IO &, const ClangTidyOptions::OptionMap &OptionMap) {
Options.reserve(OptionMap.size());
for (const auto &KeyValue : OptionMap)
- Options.emplace_back(std::string(KeyValue.getKey()), KeyValue.getValue().Value);
+ Options.emplace_back(std::string(KeyValue.getKey()),
+ KeyValue.getValue().Value);
}
ClangTidyOptions::OptionMap denormalize(IO &) {
ClangTidyOptions::OptionMap Map;
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.h b/clang-tools-extra/clang-tidy/ClangTidyOptions.h
index dd78c57..6ddc5f9 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyOptions.h
+++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.h
@@ -204,7 +204,9 @@ class FileOptionsBaseProvider : public DefaultOptionsProvider {
protected:
// A pair of configuration file base name and a function parsing
// configuration from text in the corresponding format.
- using ConfigFileHandler = std::pair<std::string, std::function<llvm::ErrorOr<ClangTidyOptions> (llvm::MemoryBufferRef)>>;
+ using ConfigFileHandler =
+ std::pair<std::string, std::function<llvm::ErrorOr<ClangTidyOptions>(
+ llvm::MemoryBufferRef)>>;
/// Configuration file handlers listed in the order of priority.
///
diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp
index 6a84704..2c17cd3 100644
--- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp
+++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp
@@ -49,8 +49,8 @@ public:
FilesToRecord.erase(File);
}
- /// Makes sure we have contents for all the files we were interested in. Ideally
- /// `FilesToRecord` should be empty.
+ /// Makes sure we have contents for all the files we were interested in.
+ /// Ideally `FilesToRecord` should be empty.
void checkAllFilesRecorded() {
LLVM_DEBUG({
for (auto FileEntry : FilesToRecord)
diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
index c347891..e599bda 100644
--- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
+++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.h
@@ -35,10 +35,10 @@ namespace tooling {
/// including the contents of the modular headers and all their transitive
/// includes.
///
-/// This allows existing tools based on PPCallbacks to retain their functionality
-/// when running with C++ modules enabled. This only works in the backwards
-/// compatible modules mode, i.e. when code can still be parsed in non-modular
-/// way.
+/// This allows existing tools based on PPCallbacks to retain their
+/// functionality when running with C++ modules enabled. This only works in the
+/// backwards compatible modules mode, i.e. when code can still be parsed in
+/// non-modular way.
class ExpandModularHeadersPPCallbacks : public PPCallbacks {
public:
ExpandModularHeadersPPCallbacks(
diff --git a/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp b/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp
index a88ee46..8d74d03 100644
--- a/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp
@@ -27,7 +27,8 @@ public:
} // namespace concurrency
-// Register the ConcurrencyTidyModule using this statically initialized variable.
+// Register the ConcurrencyTidyModule using this statically initialized
+// variable.
static ClangTidyModuleRegistry::Add<concurrency::ConcurrencyModule>
X("concurrency-module", "Adds concurrency checks.");
diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h
index 036db35..79d8cf9 100644
--- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h
+++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h
@@ -13,7 +13,8 @@
namespace clang::tidy::hicpp {
-/// Check for thrown exceptions and enforce they are all derived from std::exception.
+/// Check for thrown exceptions and enforce they are all derived from
+/// std::exception.
///
/// For the user-facing documentation see:
/// http://clang.llvm.org/extra/clang-tidy/checks/hicpp/exception-baseclass.html
diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp
index 7028c39..3f5cd4b 100644
--- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp
+++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.cpp
@@ -113,7 +113,7 @@ void MultiwayPathsCoveredCheck::check(const MatchFinder::MatchResult &Result) {
}
// Warns for degenerated 'switch' statements that neither define a case nor
// a default label.
- // FIXME: Evaluate, if emitting a fix-it to simplify that statement is
+ // FIXME: Evaluate, if emitting a fix-it to simplify that statement is
// reasonable.
if (!SwitchHasDefault && SwitchCaseCount == 0) {
diag(Switch->getBeginLoc(),
diff --git a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp
index 149d0a9..ce501ac 100644
--- a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.cpp
@@ -19,8 +19,7 @@ void MustCheckErrsCheck::registerMatchers(MatchFinder *Finder) {
"ERR_CAST", "PTR_ERR_OR_ZERO"));
auto NonCheckingStmts = stmt(anyOf(compoundStmt(), labelStmt()));
Finder->addMatcher(
- callExpr(callee(ErrFn), hasParent(NonCheckingStmts)).bind("call"),
- this);
+ callExpr(callee(ErrFn), hasParent(NonCheckingStmts)).bind("call"), this);
auto ReturnToCheck = returnStmt(hasReturnValue(callExpr(callee(ErrFn))));
auto ReturnsErrFn = functionDecl(hasDescendant(ReturnToCheck));
diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h
index dd9c5fe..5b61109 100644
--- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h
+++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h
@@ -15,8 +15,9 @@ namespace clang::tidy::llvm_check {
/// Looks at conditionals and finds and replaces cases of ``cast<>``, which will
/// assert rather than return a null pointer, and ``dyn_cast<>`` where
-/// the return value is not captured. Additionally, finds and replaces cases that match the
-/// pattern ``var && isa<X>(var)``, where ``var`` is evaluated twice.
+/// the return value is not captured. Additionally, finds and replaces cases
+/// that match the pattern ``var && isa<X>(var)``, where ``var`` is evaluated
+/// twice.
///
/// Finds cases like these:
/// \code
diff --git a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h
index 0d4df97..e3cc4c5 100644
--- a/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h
+++ b/clang-tools-extra/clang-tidy/misc/ThrowByValueCatchByReferenceCheck.h
@@ -13,7 +13,7 @@
namespace clang::tidy::misc {
-///checks for locations that do not throw by value
+/// Checks for locations that do not throw by value
// or catch by reference.
// The check is C++ only. It checks that all throw locations
// throw by value and not by pointer. Additionally it
diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h
index eb791ec..09cfebe 100644
--- a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h
@@ -13,8 +13,8 @@
namespace clang::tidy::modernize {
-/// This check warns the uses of the deprecated member types of ``std::ios_base``
-/// and replaces those that have a non-deprecated equivalent.
+/// This check warns on uses of the deprecated member types of
+/// ``std::ios_base`` and replaces those that have a non-deprecated equivalent.
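+///
+/// For instance (an illustrative sketch; ``io_state`` is one of the
+/// deprecated aliases):
+/// \code
+///   std::ios_base::io_state Flags;  // fix-it: std::ios_base::iostate
+/// \endcode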
///
/// For the user-facing documentation see:
/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/deprecated-ios-base-aliases.html
diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp
index 93c231b..3d0a1f0 100644
--- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp
@@ -688,9 +688,8 @@ bool ForLoopIndexUseVisitor::TraverseArraySubscriptExpr(ArraySubscriptExpr *E) {
if (!isIndexInSubscriptExpr(E->getIdx(), IndexVar))
return VisitorBase::TraverseArraySubscriptExpr(E);
- if ((ContainerExpr &&
- !areSameExpr(Context, Arr->IgnoreParenImpCasts(),
- ContainerExpr->IgnoreParenImpCasts())) ||
+ if ((ContainerExpr && !areSameExpr(Context, Arr->IgnoreParenImpCasts(),
+ ContainerExpr->IgnoreParenImpCasts())) ||
!arrayMatchesBoundExpr(Context, Arr->IgnoreImpCasts()->getType(),
ArrayBoundExpr)) {
// If we have already discovered the array being indexed and this isn't it
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
index ae88ec2..c2db858 100644
--- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
@@ -161,7 +161,7 @@ public:
checkName(MacroNameTok);
}
void Elifdef(SourceLocation Loc, SourceRange ConditionRange,
- SourceLocation IfLoc) override {
+ SourceLocation IfLoc) override {
PPCallbacks::Elifdef(Loc, ConditionRange, IfLoc);
}
void Elifndef(SourceLocation Loc, const Token &MacroNameTok,
@@ -169,7 +169,7 @@ public:
checkName(MacroNameTok);
}
void Elifndef(SourceLocation Loc, SourceRange ConditionRange,
- SourceLocation IfLoc) override {
+ SourceLocation IfLoc) override {
PPCallbacks::Elifndef(Loc, ConditionRange, IfLoc);
}
void Endif(SourceLocation Loc, SourceLocation IfLoc) override;
@@ -316,8 +316,7 @@ void MacroToEnumCallbacks::FileChanged(SourceLocation Loc,
CurrentFile = &Files.back();
}
-bool MacroToEnumCallbacks::isInitializer(ArrayRef<Token> MacroTokens)
-{
+bool MacroToEnumCallbacks::isInitializer(ArrayRef<Token> MacroTokens) {
IntegralLiteralExpressionMatcher Matcher(MacroTokens, LangOpts.C99 == 0);
bool Matched = Matcher.match();
bool IsC = !LangOpts.CPlusPlus;
@@ -328,7 +327,6 @@ bool MacroToEnumCallbacks::isInitializer(ArrayRef<Token> MacroTokens)
return Matched;
}
-
// Any defined but rejected macro is scanned for identifiers that
// are to be excluded as enums.
void MacroToEnumCallbacks::MacroDefined(const Token &MacroNameTok,
@@ -444,8 +442,8 @@ void MacroToEnumCallbacks::invalidateExpressionNames() {
}
void MacroToEnumCallbacks::EndOfMainFile() {
- invalidateExpressionNames();
- issueDiagnostics();
+ invalidateExpressionNames();
+ issueDiagnostics();
}
void MacroToEnumCallbacks::invalidateRange(SourceRange Range) {
@@ -517,7 +515,8 @@ void MacroToEnumCallbacks::fixEnumMacro(const MacroList &MacroList) const {
void MacroToEnumCheck::registerPPCallbacks(const SourceManager &SM,
Preprocessor *PP,
Preprocessor *ModuleExpanderPP) {
- auto Callback = std::make_unique<MacroToEnumCallbacks>(this, getLangOpts(), SM);
+ auto Callback =
+ std::make_unique<MacroToEnumCallbacks>(this, getLangOpts(), SM);
PPCallback = Callback.get();
PP->addPPCallbacks(std::move(Callback));
}
@@ -540,7 +539,7 @@ void MacroToEnumCheck::check(
const ast_matchers::MatchFinder::MatchResult &Result) {
auto *TLDecl = Result.Nodes.getNodeAs<Decl>("top");
if (TLDecl == nullptr)
- return;
+ return;
SourceRange Range = TLDecl->getSourceRange();
if (auto *TemplateFn = Result.Nodes.getNodeAs<FunctionTemplateDecl>("top")) {
diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp
index d1d7e9dc..deef358 100644
--- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp
@@ -361,8 +361,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag,
Diag << FixItHint::CreateRemoval(
SourceRange(NewStart, InitRange.getBegin()));
Diag << FixItHint::CreateRemoval(SourceRange(InitRange.getEnd(), NewEnd));
- }
- else {
+ } else {
// New array expression with default/value initialization:
// smart_ptr<Foo[]>(new int[5]());
// smart_ptr<Foo[]>(new Foo[5]());
diff --git a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
index fc46c72..e872759 100644
--- a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
@@ -110,11 +110,11 @@ public:
CheckFactories.registerCheck<UseDefaultMemberInitCheck>(
"modernize-use-default-member-init");
CheckFactories.registerCheck<UseEmplaceCheck>("modernize-use-emplace");
- CheckFactories.registerCheck<UseEqualsDefaultCheck>("modernize-use-equals-default");
+ CheckFactories.registerCheck<UseEqualsDefaultCheck>(
+ "modernize-use-equals-default");
CheckFactories.registerCheck<UseEqualsDeleteCheck>(
"modernize-use-equals-delete");
- CheckFactories.registerCheck<UseNodiscardCheck>(
- "modernize-use-nodiscard");
+ CheckFactories.registerCheck<UseNodiscardCheck>("modernize-use-nodiscard");
CheckFactories.registerCheck<UseNoexceptCheck>("modernize-use-noexcept");
CheckFactories.registerCheck<UseNullptrCheck>("modernize-use-nullptr");
CheckFactories.registerCheck<UseOverrideCheck>("modernize-use-override");
diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp
index 4587b08..1ad31d3 100644
--- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp
@@ -141,8 +141,7 @@ void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) {
"auto_ptr")
return;
- SourceLocation EndLoc =
- AutoPtrLoc.getLocWithOffset(strlen("auto_ptr") - 1);
+ SourceLocation EndLoc = AutoPtrLoc.getLocWithOffset(strlen("auto_ptr") - 1);
diag(AutoPtrLoc, "auto_ptr is deprecated, use unique_ptr instead")
<< FixItHint::CreateReplacement(SourceRange(AutoPtrLoc, EndLoc),
"unique_ptr");
diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
index 7a2d804..f4b6308 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
@@ -38,12 +38,11 @@ size_t getTypeNameLength(bool RemoveStars, StringRef Text) {
else if (C == '>')
--TemplateTypenameCntr;
const CharType NextChar =
- isAlphanumeric(C)
- ? Alpha
- : (isWhitespace(C) ||
- (!RemoveStars && TemplateTypenameCntr == 0 && C == '*'))
- ? Space
- : Punctuation;
+ isAlphanumeric(C) ? Alpha
+ : (isWhitespace(C) ||
+ (!RemoveStars && TemplateTypenameCntr == 0 && C == '*'))
+ ? Space
+ : Punctuation;
if (NextChar != Space) {
++NumChars; // Count the non-space character.
if (LastChar == Space && NextChar == Alpha && BeforeSpace == Alpha)
@@ -444,10 +443,10 @@ void UseAutoCheck::check(const MatchFinder::MatchResult &Result) {
replaceIterators(Decl, Result.Context);
} else if (const auto *Decl =
Result.Nodes.getNodeAs<DeclStmt>(DeclWithNewId)) {
- replaceExpr(Decl, Result.Context,
- [](const Expr *Expr) { return Expr->getType(); },
- "use auto when initializing with new to avoid "
- "duplicating the type name");
+ replaceExpr(
+ Decl, Result.Context, [](const Expr *Expr) { return Expr->getType(); },
+ "use auto when initializing with new to avoid "
+ "duplicating the type name");
} else if (const auto *Decl =
Result.Nodes.getNodeAs<DeclStmt>(DeclWithCastId)) {
replaceExpr(
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
index 430455a..aaf24ea 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.cpp
@@ -98,8 +98,8 @@ auto hasWantedType(llvm::ArrayRef<StringRef> TypeNames) {
// Matches member call expressions of the named method on the listed container
// types.
-auto cxxMemberCallExprOnContainer(
- StringRef MethodName, llvm::ArrayRef<StringRef> ContainerNames) {
+auto cxxMemberCallExprOnContainer(StringRef MethodName,
+ llvm::ArrayRef<StringRef> ContainerNames) {
return cxxMemberCallExpr(
hasDeclaration(functionDecl(hasName(MethodName))),
on(hasTypeOrPointeeType(hasWantedType(ContainerNames))));
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h
index 44f26c2..04c2177 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h
@@ -1,4 +1,4 @@
-//===--- UseEqualsDefaultCheck.h - clang-tidy--------------------------*- C++ -*-===//
+//===--- UseEqualsDefaultCheck.h - clang-tidy---------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h
index 8545aa2..64f60351 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h
@@ -1,4 +1,4 @@
-//===--- UseEqualsDeleteCheck.h - clang-tidy---------------------------*- C++ -*-===//
+//===--- UseEqualsDeleteCheck.h - clang-tidy----------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
index a6b00be..c38fb3a 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
@@ -229,7 +229,7 @@ public:
return true;
}
- auto* CastSubExpr = C->getSubExpr()->IgnoreParens();
+ auto *CastSubExpr = C->getSubExpr()->IgnoreParens();
// Ignore cast expressions which cast nullptr literal.
if (isa<CXXNullPtrLiteralExpr>(CastSubExpr)) {
return true;
diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
index dd7140b7..ced4825 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
@@ -316,7 +316,6 @@ findReturnTypeAndCVSourceRange(const FunctionDecl &F, const TypeLoc &ReturnLoc,
return {};
}
-
  // If the return type has no local qualifiers, its source range is accurate.
if (!hasAnyNestedLocalQualifiers(F.getReturnType()))
return ReturnTypeRange;
diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h
index 6f02bec..80f0221 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h
@@ -26,6 +26,7 @@ public:
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+
private:
const bool SafeMode;
};
diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp
index 109d701..1e0a0a5 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.cpp
@@ -39,10 +39,10 @@ void UseUncaughtExceptionsCheck::registerMatchers(MatchFinder *Finder) {
this);
// CallExpr in initialisation list: warning, fix-it with avoiding narrowing
// conversions.
- Finder->addMatcher(callExpr(DirectCallToUncaughtException,
- hasAncestor(initListExpr()))
- .bind("init_call_expr"),
- this);
+ Finder->addMatcher(
+ callExpr(DirectCallToUncaughtException, hasAncestor(initListExpr()))
+ .bind("init_call_expr"),
+ this);
}
void UseUncaughtExceptionsCheck::check(const MatchFinder::MatchResult &Result) {
diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h
index 79b475c..4867752 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h
@@ -13,9 +13,10 @@
namespace clang::tidy::modernize {
-/// This check will warn on calls to std::uncaught_exception and replace them with calls to
-/// std::uncaught_exceptions, since std::uncaught_exception was deprecated in C++17. In case of
-/// macro ID there will be only a warning without fixits.
+/// This check will warn on calls to std::uncaught_exception and replace them
+/// with calls to std::uncaught_exceptions, since std::uncaught_exception was
+/// deprecated in C++17. In case of macro ID there will be only a warning
+/// without fixits.
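+///
+/// A sketch of the replacement (illustrative usage only):
+/// \code
+///   if (std::uncaught_exception())   // warning; fix-it suggests
+///   if (std::uncaught_exceptions())  // the C++17 replacement
+/// \endcode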
///
/// For the user-facing documentation see:
/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-uncaught-exceptions.html
diff --git a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h
index 72a6c65..b343cb0 100644
--- a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h
+++ b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h
@@ -19,7 +19,7 @@ namespace clang::tidy::objc {
/// For the user-facing documentation see:
/// http://clang.llvm.org/extra/clang-tidy/checks/objc/avoid-nserror-init.html
class AvoidNSErrorInitCheck : public ClangTidyCheck {
- public:
+public:
AvoidNSErrorInitCheck(StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
@@ -31,4 +31,4 @@ class AvoidNSErrorInitCheck : public ClangTidyCheck {
} // namespace clang::tidy::objc
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_AVOIDNSERRORINITCHECK_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_OBJC_AVOIDNSERRORINITCHECK_H
diff --git a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp
index 579aa29..089538d 100644
--- a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.cpp
@@ -35,14 +35,11 @@ constexpr char DefaultForbiddenSuperClassNames[] =
} // namespace
-ForbiddenSubclassingCheck::ForbiddenSubclassingCheck(
- StringRef Name,
- ClangTidyContext *Context)
+ForbiddenSubclassingCheck::ForbiddenSubclassingCheck(StringRef Name,
+ ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),
- ForbiddenSuperClassNames(
- utils::options::parseStringList(
- Options.get("ClassNames", DefaultForbiddenSuperClassNames))) {
-}
+ ForbiddenSuperClassNames(utils::options::parseStringList(
+ Options.get("ClassNames", DefaultForbiddenSuperClassNames))) {}
void ForbiddenSubclassingCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(
@@ -53,27 +50,22 @@ void ForbiddenSubclassingCheck::registerMatchers(MatchFinder *Finder) {
this);
}
-void ForbiddenSubclassingCheck::check(
- const MatchFinder::MatchResult &Result) {
- const auto *SubClass = Result.Nodes.getNodeAs<ObjCInterfaceDecl>(
- "subclass");
+void ForbiddenSubclassingCheck::check(const MatchFinder::MatchResult &Result) {
+ const auto *SubClass = Result.Nodes.getNodeAs<ObjCInterfaceDecl>("subclass");
assert(SubClass != nullptr);
- const auto *SuperClass = Result.Nodes.getNodeAs<ObjCInterfaceDecl>(
- "superclass");
+ const auto *SuperClass =
+ Result.Nodes.getNodeAs<ObjCInterfaceDecl>("superclass");
assert(SuperClass != nullptr);
diag(SubClass->getLocation(),
"Objective-C interface %0 subclasses %1, which is not "
"intended to be subclassed")
- << SubClass
- << SuperClass;
+ << SubClass << SuperClass;
}
void ForbiddenSubclassingCheck::storeOptions(
ClangTidyOptions::OptionMap &Opts) {
- Options.store(
- Opts,
- "ForbiddenSuperClassNames",
- utils::options::serializeStringList(ForbiddenSuperClassNames));
+ Options.store(Opts, "ForbiddenSuperClassNames",
+ utils::options::serializeStringList(ForbiddenSuperClassNames));
}
} // namespace clang::tidy::objc
diff --git a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp
index b6f8888..35113f8 100644
--- a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp
@@ -35,22 +35,19 @@ public:
"objc-dealloc-in-category");
CheckFactories.registerCheck<ForbiddenSubclassingCheck>(
"objc-forbidden-subclassing");
- CheckFactories.registerCheck<MissingHashCheck>(
- "objc-missing-hash");
+ CheckFactories.registerCheck<MissingHashCheck>("objc-missing-hash");
CheckFactories.registerCheck<NSDateFormatterCheck>("objc-nsdate-formatter");
CheckFactories.registerCheck<NSInvocationArgumentLifetimeCheck>(
"objc-nsinvocation-argument-lifetime");
CheckFactories.registerCheck<PropertyDeclarationCheck>(
"objc-property-declaration");
- CheckFactories.registerCheck<SuperSelfCheck>(
- "objc-super-self");
+ CheckFactories.registerCheck<SuperSelfCheck>("objc-super-self");
}
};
// Register the ObjCTidyModule using this statically initialized variable.
-static ClangTidyModuleRegistry::Add<ObjCModule> X(
- "objc-module",
- "Adds Objective-C lint checks.");
+static ClangTidyModuleRegistry::Add<ObjCModule>
+ X("objc-module", "Adds Objective-C lint checks.");
} // namespace objc
diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp
index cda24de..3b847f5 100644
--- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.cpp
@@ -88,7 +88,7 @@ bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
auto RegexExp = llvm::Regex(llvm::StringRef(validPropertyNameRegex(false)));
return RegexExp.match(PropertyName.substr(Start + 1));
}
-} // namespace
+} // namespace
void PropertyDeclarationCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(objcPropertyDecl(
diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h
index 93ed32e..83af95c 100644
--- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h
@@ -25,7 +25,7 @@ namespace clang::tidy::performance {
class FasterStringFindCheck : public ClangTidyCheck {
public:
FasterStringFindCheck(StringRef Name, ClangTidyContext *Context);
- bool isLanguageVersionSupported(const LangOptions &LangOpts) const override{
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
return LangOpts.CPlusPlus;
}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
diff --git a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h
index ab9b89d..8fabbfa 100644
--- a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h
@@ -20,7 +20,7 @@ namespace clang::tidy::performance {
class ForRangeCopyCheck : public ClangTidyCheck {
public:
ForRangeCopyCheck(StringRef Name, ClangTidyContext *Context);
- bool isLanguageVersionSupported(const LangOptions &LangOpts) const override{
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
return LangOpts.CPlusPlus11;
}
void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp
index 86fca07..1ecf1e1 100644
--- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.cpp
@@ -65,8 +65,7 @@ void ImplicitConversionInLoopCheck::check(
const MatchFinder::MatchResult &Result) {
const auto *VD = Result.Nodes.getNodeAs<VarDecl>("faulty-var");
const auto *Init = Result.Nodes.getNodeAs<Expr>("init");
- const auto *OperatorCall =
- Result.Nodes.getNodeAs<Expr>("operator-call");
+ const auto *OperatorCall = Result.Nodes.getNodeAs<Expr>("operator-call");
if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(Init))
Init = Cleanup->getSubExpr();
diff --git a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h
index 2f3c9ae..d176407 100644
--- a/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/ImplicitConversionInLoopCheck.h
@@ -20,9 +20,9 @@ class ImplicitConversionInLoopCheck : public ClangTidyCheck {
public:
ImplicitConversionInLoopCheck(StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
- bool isLanguageVersionSupported(const LangOptions &LangOpts) const override{
- return LangOpts.CPlusPlus11;
- }
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+ return LangOpts.CPlusPlus11;
+ }
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp
index 94cb7ec3..9692fcb 100644
--- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp
@@ -126,15 +126,14 @@ void InefficientVectorOperationCheck::addMatcher(
//
// FIXME: Support more types of counter-based loops like decrement loops.
Finder->addMatcher(
- forStmt(
- hasLoopInit(LoopVarInit),
- hasCondition(binaryOperator(
- hasOperatorName("<"), hasLHS(RefersToLoopVar),
- hasRHS(expr(unless(hasDescendant(expr(RefersToLoopVar))))
- .bind(LoopEndExprName)))),
- hasIncrement(unaryOperator(hasOperatorName("++"),
- hasUnaryOperand(RefersToLoopVar))),
- HasInterestingLoopBody, InInterestingCompoundStmt)
+ forStmt(hasLoopInit(LoopVarInit),
+ hasCondition(binaryOperator(
+ hasOperatorName("<"), hasLHS(RefersToLoopVar),
+ hasRHS(expr(unless(hasDescendant(expr(RefersToLoopVar))))
+ .bind(LoopEndExprName)))),
+ hasIncrement(unaryOperator(hasOperatorName("++"),
+ hasUnaryOperand(RefersToLoopVar))),
+ HasInterestingLoopBody, InInterestingCompoundStmt)
.bind(LoopCounterName),
this);
@@ -179,7 +178,7 @@ void InefficientVectorOperationCheck::registerMatchers(MatchFinder *Finder) {
void InefficientVectorOperationCheck::check(
const MatchFinder::MatchResult &Result) {
- auto* Context = Result.Context;
+ auto *Context = Result.Context;
if (Context->getDiagnostics().hasUncompilableErrorOccurred())
return;
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h
index 7a2745f..4f45ff4 100644
--- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h
@@ -24,7 +24,7 @@ namespace clang::tidy::performance {
class InefficientVectorOperationCheck : public ClangTidyCheck {
public:
InefficientVectorOperationCheck(StringRef Name, ClangTidyContext *Context);
- bool isLanguageVersionSupported(const LangOptions &LangOpts) const override{
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
return LangOpts.CPlusPlus;
}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
index ce360dde..b6f1981 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp
@@ -111,10 +111,11 @@ AST_MATCHER_FUNCTION(StatementMatcher, isConstRefReturningFunctionCall) {
// an alias to one of its arguments and the arguments need to be checked
// for const use as well.
return callExpr(argumentCountIs(0),
- callee(functionDecl(returns(hasCanonicalType(matchers::isReferenceToConst())),
+ callee(functionDecl(returns(hasCanonicalType(
+ matchers::isReferenceToConst())),
unless(cxxMethodDecl(unless(isStatic()))))
- .bind(FunctionDeclId)))
- .bind(InitFunctionCallId);
+ .bind(FunctionDeclId)))
+ .bind(InitFunctionCallId);
}
AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst,
@@ -234,12 +235,14 @@ UnnecessaryCopyInitialization::UnnecessaryCopyInitialization(
Options.get("ExcludedContainerTypes", ""))) {}
void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) {
- auto LocalVarCopiedFrom = [this](const ast_matchers::internal::Matcher<Expr> &CopyCtorArg) {
- return compoundStmt(
- forEachDescendant(
- declStmt(
- unless(has(decompositionDecl())),
- has(varDecl(hasLocalStorage(),
+ auto LocalVarCopiedFrom =
+ [this](const ast_matchers::internal::Matcher<Expr> &CopyCtorArg) {
+ return compoundStmt(
+ forEachDescendant(
+ declStmt(
+ unless(has(decompositionDecl())),
+ has(varDecl(
+ hasLocalStorage(),
hasType(qualType(
hasCanonicalType(allOf(
matchers::isExpensiveToCopy(),
@@ -256,10 +259,10 @@ void UnnecessaryCopyInitialization::registerMatchers(MatchFinder *Finder) {
isCopyConstructor())),
hasArgument(0, CopyCtorArg))
.bind("ctorCall"))))
- .bind("newVarDecl")))
- .bind("declStmt")))
- .bind("blockStmt");
- };
+ .bind("newVarDecl")))
+ .bind("declStmt")))
+ .bind("blockStmt");
+ };
Finder->addMatcher(
LocalVarCopiedFrom(anyOf(
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h
index ab0f1ecf..38f756f 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.h
@@ -25,7 +25,7 @@ namespace clang::tidy::performance {
class UnnecessaryCopyInitialization : public ClangTidyCheck {
public:
UnnecessaryCopyInitialization(StringRef Name, ClangTidyContext *Context);
- bool isLanguageVersionSupported(const LangOptions &LangOpts) const override{
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
return LangOpts.CPlusPlus;
}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h
index c254a37..92fc0af 100644
--- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h
+++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h
@@ -30,7 +30,7 @@ public:
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
- private:
+private:
llvm::SmallString<32> Std;
const bool Suggest;
};
diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h
index c3d779e..e329b3d 100644
--- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h
@@ -19,16 +19,16 @@ namespace clang::tidy::readability {
/// For the user-facing documentation see:
/// http://clang.llvm.org/extra/clang-tidy/checks/readability/const-return-type.html
class ConstReturnTypeCheck : public ClangTidyCheck {
- public:
- ConstReturnTypeCheck(StringRef Name, ClangTidyContext *Context)
- : ClangTidyCheck(Name, Context),
- IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {}
- void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
- void registerMatchers(ast_matchers::MatchFinder *Finder) override;
- void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+public:
+ ConstReturnTypeCheck(StringRef Name, ClangTidyContext *Context)
+ : ClangTidyCheck(Name, Context),
+ IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {}
+ void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+ void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+ void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
- private:
- const bool IgnoreMacros;
+private:
+ const bool IgnoreMacros;
};
} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp
index 5cc22a2..10aa779 100644
--- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp
@@ -131,8 +131,8 @@ findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration,
InconsistentDeclarationsContainer
findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration,
- const FunctionDecl *ParameterSourceDeclaration,
- SourceManager &SM, bool Strict) {
+ const FunctionDecl *ParameterSourceDeclaration,
+ SourceManager &SM, bool Strict) {
InconsistentDeclarationsContainer InconsistentDeclarations;
SourceLocation ParameterSourceLocation =
ParameterSourceDeclaration->getLocation();
diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h
index 7978784..70a1788 100644
--- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h
@@ -43,11 +43,12 @@ private:
bool isBitFieldWidth(const clang::ast_matchers::MatchFinder::MatchResult &,
const FloatingLiteral &) const {
- return false;
+ return false;
}
- bool isBitFieldWidth(const clang::ast_matchers::MatchFinder::MatchResult &Result,
- const IntegerLiteral &Literal) const;
+ bool
+ isBitFieldWidth(const clang::ast_matchers::MatchFinder::MatchResult &Result,
+ const IntegerLiteral &Literal) const;
bool isUserDefinedLiteral(
const clang::ast_matchers::MatchFinder::MatchResult &Result,
diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp
index 00999ee..91a08b9 100644
--- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp
@@ -50,10 +50,9 @@ std::optional<Token> findQualToken(const VarDecl *Decl, Qualifier Qual,
if (FileRange.isInvalid())
return std::nullopt;
- tok::TokenKind Tok =
- Qual == Qualifier::Const
- ? tok::kw_const
- : Qual == Qualifier::Volatile ? tok::kw_volatile : tok::kw_restrict;
+ tok::TokenKind Tok = Qual == Qualifier::Const ? tok::kw_const
+ : Qual == Qualifier::Volatile ? tok::kw_volatile
+ : tok::kw_restrict;
return utils::lexer::getQualifyingToken(Tok, FileRange, *Result.Context,
*Result.SourceManager);
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp
index 44053eb..a70719f 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.cpp
@@ -13,7 +13,8 @@ using namespace clang::ast_matchers;
namespace clang::tidy::readability {
-void RedundantFunctionPtrDereferenceCheck::registerMatchers(MatchFinder *Finder) {
+void RedundantFunctionPtrDereferenceCheck::registerMatchers(
+ MatchFinder *Finder) {
Finder->addMatcher(
traverse(TK_AsIs, unaryOperator(hasOperatorName("*"),
has(implicitCastExpr(hasCastKind(
@@ -22,7 +23,8 @@ void RedundantFunctionPtrDereferenceCheck::registerMatchers(MatchFinder *Finder)
this);
}
-void RedundantFunctionPtrDereferenceCheck::check(const MatchFinder::MatchResult &Result) {
+void RedundantFunctionPtrDereferenceCheck::check(
+ const MatchFinder::MatchResult &Result) {
const auto *Operator = Result.Nodes.getNodeAs<UnaryOperator>("op");
diag(Operator->getOperatorLoc(),
"redundant repeated dereference of function pointer")
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h
index fdcf821..a04e9c1 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h
@@ -19,7 +19,8 @@ namespace clang::tidy::readability {
/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-function-ptr-dereference.html
class RedundantFunctionPtrDereferenceCheck : public ClangTidyCheck {
public:
- RedundantFunctionPtrDereferenceCheck(StringRef Name, ClangTidyContext *Context)
+ RedundantFunctionPtrDereferenceCheck(StringRef Name,
+ ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp
index 57f13db..c90d152 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp
@@ -52,12 +52,11 @@ void RedundantStringCStrCheck::registerMatchers(
const auto StringConstructorExpr = expr(anyOf(
cxxConstructExpr(argumentCountIs(1),
hasDeclaration(cxxMethodDecl(hasName("basic_string")))),
- cxxConstructExpr(
- argumentCountIs(2),
- hasDeclaration(cxxMethodDecl(hasName("basic_string"))),
- // If present, the second argument is the alloc object which must not
- // be present explicitly.
- hasArgument(1, cxxDefaultArgExpr()))));
+ cxxConstructExpr(argumentCountIs(2),
+ hasDeclaration(cxxMethodDecl(hasName("basic_string"))),
+ // If present, the second argument is the alloc object
+ // which must not be present explicitly.
+ hasArgument(1, cxxDefaultArgExpr()))));
// Match string constructor.
const auto StringViewConstructorExpr = cxxConstructExpr(
@@ -105,8 +104,9 @@ void RedundantStringCStrCheck::registerMatchers(
// Detect: 'dst.append(str.c_str())' -> 'dst.append(str)'
Finder->addMatcher(
- cxxMemberCallExpr(on(StringExpr), callee(decl(cxxMethodDecl(hasAnyName(
- "append", "assign", "compare")))),
+ cxxMemberCallExpr(on(StringExpr),
+ callee(decl(cxxMethodDecl(
+ hasAnyName("append", "assign", "compare")))),
argumentCountIs(1), hasArgument(0, StringCStrCallExpr)),
this);
diff --git a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h
index 83bab16..deffb09 100644
--- a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h
@@ -25,7 +25,7 @@ public:
}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
- void storeOptions(ClangTidyOptions::OptionMap& Opts) override;
+ void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
std::optional<TraversalKind> getCheckTraversalKind() const override {
return TK_IgnoreUnlessSpelledInSource;
}
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index 7c3aee2..dc36db4 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -337,8 +337,7 @@ Allow empty enabled checks. This suppresses
the "no checks enabled" error when disabling
all of the checks.
)"),
- cl::init(false),
- cl::cat(ClangTidyCategory));
+ cl::init(false), cl::cat(ClangTidyCategory));
namespace clang::tidy {
@@ -370,8 +369,8 @@ static void printStats(const ClangTidyStats &Stats) {
}
}
-static std::unique_ptr<ClangTidyOptionsProvider> createOptionsProvider(
- llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) {
+static std::unique_ptr<ClangTidyOptionsProvider>
+createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) {
ClangTidyGlobalOptions GlobalOptions;
if (std::error_code Err = parseLineFilter(LineFilter, GlobalOptions)) {
llvm::errs() << "Invalid LineFilter: " << Err.message() << "\n\nUsage:\n";
diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp
index 83248e1..db1ea1b 100644
--- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp
+++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp
@@ -48,8 +48,7 @@ StringRef makeCanonicalName(StringRef Str, IncludeSorter::IncludeStyle Style) {
if (StartIndex == StringRef::npos) {
StartIndex = 0;
}
- return Canonical.substr(
- 0, Canonical.find_first_of('+', StartIndex));
+ return Canonical.substr(0, Canonical.find_first_of('+', StartIndex));
}
return removeFirstSuffix(
removeFirstSuffix(Str, {".cc", ".cpp", ".c", ".h", ".hpp"}),
diff --git a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp
index 68574d2..44db0c2 100644
--- a/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp
+++ b/clang-tools-extra/clang-tidy/utils/TypeTraits.cpp
@@ -41,8 +41,7 @@ std::optional<bool> isExpensiveToCopy(QualType Type,
return std::nullopt;
return !Type.isTriviallyCopyableType(Context) &&
!classHasTrivialCopyAndDestroy(Type) &&
- !hasDeletedCopyConstructor(Type) &&
- !Type->isObjCLifetimeType();
+ !hasDeletedCopyConstructor(Type) && !Type->isObjCLifetimeType();
}
bool recordIsTriviallyDefaultConstructible(const RecordDecl &RecordDecl,
diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp
index bdab2b8..20a2386 100644
--- a/clang-tools-extra/clangd/InlayHints.cpp
+++ b/clang-tools-extra/clangd/InlayHints.cpp
@@ -33,6 +33,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/identity.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
@@ -375,7 +376,11 @@ static FunctionProtoTypeLoc getPrototypeLoc(Expr *Fn) {
}
if (auto F = Target.getAs<FunctionProtoTypeLoc>()) {
- return F;
+ // In some edge cases the AST can contain a "trivial" FunctionProtoTypeLoc
+ // which has null parameters. Avoid these as they don't contain useful
+ // information.
+ if (llvm::all_of(F.getParams(), llvm::identity<ParmVarDecl *>()))
+ return F;
}
return {};
diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp
index c3331d2..e0cd955 100644
--- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp
+++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp
@@ -1011,11 +1011,16 @@ TEST(ParameterHints, FunctionPointer) {
f3_t f3;
using f4_t = void(__stdcall *)(int param);
f4_t f4;
+ __attribute__((noreturn)) f4_t f5;
void bar() {
f1($f1[[42]]);
f2($f2[[42]]);
f3($f3[[42]]);
f4($f4[[42]]);
+ // This one runs into an edge case in clang's type model
+ // and we can't extract the parameter name. But at least
+ // we shouldn't crash.
+ f5(42);
}
)cpp",
ExpectedHint{"param: ", "f1"}, ExpectedHint{"param: ", "f2"},
diff --git a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
index 03e6576..ae2d98c 100644
--- a/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
+++ b/clang-tools-extra/clangd/unittests/PrerequisiteModulesTest.cpp
@@ -247,13 +247,11 @@ import Dep;
ProjectModules->getRequiredModules(getFullPath("M.cppm")).empty());
// Set the mangler to filter out the invalid flag
- ProjectModules->setCommandMangler(
- [](tooling::CompileCommand &Command, PathRef) {
- auto const It =
- std::find(Command.CommandLine.begin(), Command.CommandLine.end(),
- "-invalid-unknown-flag");
- Command.CommandLine.erase(It);
- });
+ ProjectModules->setCommandMangler([](tooling::CompileCommand &Command,
+ PathRef) {
+ auto const It = llvm::find(Command.CommandLine, "-invalid-unknown-flag");
+ Command.CommandLine.erase(It);
+ });
// And now it returns a non-empty list of required modules since the
// compilation succeeded
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 7354482..0ba301f 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1009,6 +1009,7 @@ to ``float``; see below for more information on this emulation.
* 64-bit ARM (AArch64)
* RISC-V
* X86 (when SSE2 is available)
+ * LoongArch
(For X86, SSE2 is available on 64-bit and all recent 32-bit processors.)
@@ -6501,3 +6502,77 @@ qualifications.
Note, Clang does not allow an ``_Atomic`` function type because
of explicit constraints against atomically qualified (arrays and) function
types.
+
+
+Underspecified Object Declarations in C
+=======================================
+
+C23 introduced the notion of `underspecified object declarations <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3006.htm>`_
+(note, the final standard's text differs from WG14 N3006 due to changes
+during national body comment review). When an object is declared with the
+``constexpr`` storage class specifier or has a deduced type (with the ``auto``
+specifier), it is said to be "underspecified". Underspecified declarations have
+different requirements than non-underspecified declarations. In particular, the
+identifier being declared cannot be used in its initialization. e.g.,
+
+.. code-block:: c
+
+ auto x = x; // Invalid
+ constexpr int y = y; // Invalid
+
+The standard leaves it implementation-defined whether an underspecified
+declaration may introduce additional identifiers as part of the declaration.
+
+Clang allows additional identifiers to be declared in the following cases:
+
+* A compound literal may introduce a new type. e.g.,
+
+.. code-block:: c
+
+ auto x = (struct S { int x, y; }){ 1, 2 }; // Accepted by Clang
+ constexpr int i = (struct T { int x; }){ 1 }.x; // Accepted by Clang
+
+* The type specifier for a ``constexpr`` declaration may define a new type.
+ e.g.,
+
+.. code-block:: c
+
+ constexpr struct S { int x; } s = { 1 }; // Accepted by Clang
+
+* A function declarator may be declared with parameters, including parameters
+ which introduce a new type. e.g.,
+
+.. code-block:: c
+
+ constexpr int (*fp)(int x) = nullptr; // Accepted by Clang
+ auto f = (void (*)(struct S { int x; } s))nullptr; // Accepted by Clang
+
+* The initializer may contain a GNU statement expression which defines new
+ types or objects. e.g.,
+
+.. code-block:: c
+
+ constexpr int i = ({ // Accepted by Clang
+ constexpr int x = 12;
+ constexpr struct S { int x; } s = { x };
+ s.x;
+ });
+ auto x = ({ struct S { int x; } s = { 0 }; s; }); // Accepted by Clang
+
+Clang intentionally does not implement the changed scoping rules from C23
+for underspecified declarations: implementing them while keeping reasonable
+diagnostic behavior would significantly complicate the implementation. As a
+consequence, Clang fails to reject some code that should be rejected. e.g.,
+
+.. code-block:: c
+
+ // This should be rejected because 'x' is not in scope within the initializer
+ // of an underspecified declaration. Clang accepts because it treats the scope
+ // of the identifier as beginning immediately after the declarator, same as with
+ // a non-underspecified declaration.
+ constexpr int x = sizeof(x);
+
+ // Clang rejects this code with a diagnostic about using the variable within its
+ // own initializer rather than rejecting the code with an undeclared identifier
+ // diagnostic.
+ auto x = x;
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 322686f..beed0da 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -157,6 +157,8 @@ Resolutions to C++ Defect Reports
`constraint-expression <https://cplusplus.github.io/CWG/issues/2517.html>`_.
- Implemented `CWG3005 Function parameters should never be name-independent <https://wg21.link/CWG3005>`_.
+- Implemented `CWG2496 ref-qualifiers and virtual overriding <https://wg21.link/CWG2496>`_.
+
C Language Changes
------------------
@@ -289,6 +291,8 @@ C23 Feature Support
directive. Fixes #GH126940.
- Fixed a crash when a declaration of a ``constexpr`` variable with an invalid
type. Fixes #GH140887
+- Documented `WG14 N3006 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3006.htm>`_
+  which clarifies how Clang handles underspecified object declarations.
C11 Feature Support
^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/DeclContextInternals.h b/clang/include/clang/AST/DeclContextInternals.h
index b17b7627..a0f3788 100644
--- a/clang/include/clang/AST/DeclContextInternals.h
+++ b/clang/include/clang/AST/DeclContextInternals.h
@@ -177,13 +177,12 @@ public:
if (ND->isFromASTFile())
return true;
// FIXME: Can we get rid of this loop completely?
- for (NamedDecl *D : Decls)
+ return llvm::any_of(Decls, [ND](NamedDecl *D) {
// Only replace the local declaration if the external declaration has
- // higher visibilities.
- if (D->getModuleOwnershipKind() <= ND->getModuleOwnershipKind() &&
- D->declarationReplaces(ND, /*IsKnownNewer=*/false))
- return true;
- return false;
+      // higher visibilities.
+ return D->getModuleOwnershipKind() <= ND->getModuleOwnershipKind() &&
+ D->declarationReplaces(ND, /*IsKnownNewer=*/false);
+ });
});
// Don't have any pending external decls any more.
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index 48a6aea..6c4bd6f 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -1953,10 +1953,6 @@ class CaseStmt final
return NumMandatoryStmtPtr + caseStmtIsGNURange();
}
- unsigned numTrailingObjects(OverloadToken<SourceLocation>) const {
- return caseStmtIsGNURange();
- }
-
unsigned lhsOffset() const { return LhsOffset; }
unsigned rhsOffset() const { return LhsOffset + caseStmtIsGNURange(); }
unsigned subStmtOffset() const { return rhsOffset() + SubStmtOffsetFromRhs; }
@@ -2228,10 +2224,8 @@ class AttributedStmt final
std::fill_n(getAttrArrayPtr(), NumAttrs, nullptr);
}
- const Attr *const *getAttrArrayPtr() const {
- return getTrailingObjects<const Attr *>();
- }
- const Attr **getAttrArrayPtr() { return getTrailingObjects<const Attr *>(); }
+ const Attr *const *getAttrArrayPtr() const { return getTrailingObjects(); }
+ const Attr **getAttrArrayPtr() { return getTrailingObjects(); }
public:
static AttributedStmt *Create(const ASTContext &C, SourceLocation Loc,
@@ -2543,7 +2537,7 @@ class SwitchStmt final : public Stmt,
SourceLocation LParenLoc;
SourceLocation RParenLoc;
- unsigned numTrailingObjects(OverloadToken<Stmt *>) const {
+ unsigned numTrailingStatements() const {
return NumMandatoryStmtPtr + hasInitStorage() + hasVarStorage();
}
@@ -2579,40 +2573,34 @@ public:
bool hasVarStorage() const { return SwitchStmtBits.HasVar; }
Expr *getCond() {
- return reinterpret_cast<Expr *>(getTrailingObjects<Stmt *>()[condOffset()]);
+ return reinterpret_cast<Expr *>(getTrailingObjects()[condOffset()]);
}
const Expr *getCond() const {
- return reinterpret_cast<Expr *>(getTrailingObjects<Stmt *>()[condOffset()]);
+ return reinterpret_cast<Expr *>(getTrailingObjects()[condOffset()]);
}
void setCond(Expr *Cond) {
- getTrailingObjects<Stmt *>()[condOffset()] = reinterpret_cast<Stmt *>(Cond);
+ getTrailingObjects()[condOffset()] = reinterpret_cast<Stmt *>(Cond);
}
- Stmt *getBody() { return getTrailingObjects<Stmt *>()[bodyOffset()]; }
- const Stmt *getBody() const {
- return getTrailingObjects<Stmt *>()[bodyOffset()];
- }
+ Stmt *getBody() { return getTrailingObjects()[bodyOffset()]; }
+ const Stmt *getBody() const { return getTrailingObjects()[bodyOffset()]; }
- void setBody(Stmt *Body) {
- getTrailingObjects<Stmt *>()[bodyOffset()] = Body;
- }
+ void setBody(Stmt *Body) { getTrailingObjects()[bodyOffset()] = Body; }
Stmt *getInit() {
- return hasInitStorage() ? getTrailingObjects<Stmt *>()[initOffset()]
- : nullptr;
+ return hasInitStorage() ? getTrailingObjects()[initOffset()] : nullptr;
}
const Stmt *getInit() const {
- return hasInitStorage() ? getTrailingObjects<Stmt *>()[initOffset()]
- : nullptr;
+ return hasInitStorage() ? getTrailingObjects()[initOffset()] : nullptr;
}
void setInit(Stmt *Init) {
assert(hasInitStorage() &&
"This switch statement has no storage for an init statement!");
- getTrailingObjects<Stmt *>()[initOffset()] = Init;
+ getTrailingObjects()[initOffset()] = Init;
}
/// Retrieve the variable declared in this "switch" statement, if any.
@@ -2636,20 +2624,20 @@ public:
/// If this SwitchStmt has a condition variable, return the faux DeclStmt
/// associated with the creation of that condition variable.
DeclStmt *getConditionVariableDeclStmt() {
- return hasVarStorage() ? static_cast<DeclStmt *>(
- getTrailingObjects<Stmt *>()[varOffset()])
- : nullptr;
+ return hasVarStorage()
+ ? static_cast<DeclStmt *>(getTrailingObjects()[varOffset()])
+ : nullptr;
}
const DeclStmt *getConditionVariableDeclStmt() const {
- return hasVarStorage() ? static_cast<DeclStmt *>(
- getTrailingObjects<Stmt *>()[varOffset()])
- : nullptr;
+ return hasVarStorage()
+ ? static_cast<DeclStmt *>(getTrailingObjects()[varOffset()])
+ : nullptr;
}
void setConditionVariableDeclStmt(DeclStmt *CondVar) {
assert(hasVarStorage());
- getTrailingObjects<Stmt *>()[varOffset()] = CondVar;
+ getTrailingObjects()[varOffset()] = CondVar;
}
SwitchCase *getSwitchCaseList() { return FirstCase; }
@@ -2693,15 +2681,13 @@ public:
// Iterators
child_range children() {
- return child_range(getTrailingObjects<Stmt *>(),
- getTrailingObjects<Stmt *>() +
- numTrailingObjects(OverloadToken<Stmt *>()));
+ return child_range(getTrailingObjects(),
+ getTrailingObjects() + numTrailingStatements());
}
const_child_range children() const {
- return const_child_range(getTrailingObjects<Stmt *>(),
- getTrailingObjects<Stmt *>() +
- numTrailingObjects(OverloadToken<Stmt *>()));
+ return const_child_range(getTrailingObjects(),
+ getTrailingObjects() + numTrailingStatements());
}
static bool classof(const Stmt *T) {
@@ -2738,7 +2724,7 @@ class WhileStmt final : public Stmt,
unsigned condOffset() const { return VarOffset + hasVarStorage(); }
unsigned bodyOffset() const { return condOffset() + BodyOffsetFromCond; }
- unsigned numTrailingObjects(OverloadToken<Stmt *>) const {
+ unsigned numTrailingStatements() const {
return NumMandatoryStmtPtr + hasVarStorage();
}
@@ -2764,25 +2750,21 @@ public:
bool hasVarStorage() const { return WhileStmtBits.HasVar; }
Expr *getCond() {
- return reinterpret_cast<Expr *>(getTrailingObjects<Stmt *>()[condOffset()]);
+ return reinterpret_cast<Expr *>(getTrailingObjects()[condOffset()]);
}
const Expr *getCond() const {
- return reinterpret_cast<Expr *>(getTrailingObjects<Stmt *>()[condOffset()]);
+ return reinterpret_cast<Expr *>(getTrailingObjects()[condOffset()]);
}
void setCond(Expr *Cond) {
- getTrailingObjects<Stmt *>()[condOffset()] = reinterpret_cast<Stmt *>(Cond);
+ getTrailingObjects()[condOffset()] = reinterpret_cast<Stmt *>(Cond);
}
- Stmt *getBody() { return getTrailingObjects<Stmt *>()[bodyOffset()]; }
- const Stmt *getBody() const {
- return getTrailingObjects<Stmt *>()[bodyOffset()];
- }
+ Stmt *getBody() { return getTrailingObjects()[bodyOffset()]; }
+ const Stmt *getBody() const { return getTrailingObjects()[bodyOffset()]; }
- void setBody(Stmt *Body) {
- getTrailingObjects<Stmt *>()[bodyOffset()] = Body;
- }
+ void setBody(Stmt *Body) { getTrailingObjects()[bodyOffset()] = Body; }
/// Retrieve the variable declared in this "while" statement, if any.
///
@@ -2804,20 +2786,20 @@ public:
/// If this WhileStmt has a condition variable, return the faux DeclStmt
/// associated with the creation of that condition variable.
DeclStmt *getConditionVariableDeclStmt() {
- return hasVarStorage() ? static_cast<DeclStmt *>(
- getTrailingObjects<Stmt *>()[varOffset()])
- : nullptr;
+ return hasVarStorage()
+ ? static_cast<DeclStmt *>(getTrailingObjects()[varOffset()])
+ : nullptr;
}
const DeclStmt *getConditionVariableDeclStmt() const {
- return hasVarStorage() ? static_cast<DeclStmt *>(
- getTrailingObjects<Stmt *>()[varOffset()])
- : nullptr;
+ return hasVarStorage()
+ ? static_cast<DeclStmt *>(getTrailingObjects()[varOffset()])
+ : nullptr;
}
void setConditionVariableDeclStmt(DeclStmt *CondVar) {
assert(hasVarStorage());
- getTrailingObjects<Stmt *>()[varOffset()] = CondVar;
+ getTrailingObjects()[varOffset()] = CondVar;
}
SourceLocation getWhileLoc() const { return WhileStmtBits.WhileLoc; }
@@ -2839,15 +2821,13 @@ public:
// Iterators
child_range children() {
- return child_range(getTrailingObjects<Stmt *>(),
- getTrailingObjects<Stmt *>() +
- numTrailingObjects(OverloadToken<Stmt *>()));
+ return child_range(getTrailingObjects(),
+ getTrailingObjects() + numTrailingStatements());
}
const_child_range children() const {
- return const_child_range(getTrailingObjects<Stmt *>(),
- getTrailingObjects<Stmt *>() +
- numTrailingObjects(OverloadToken<Stmt *>()));
+ return const_child_range(getTrailingObjects(),
+ getTrailingObjects() + numTrailingStatements());
}
};
@@ -3158,10 +3138,6 @@ class ReturnStmt final
/// True if this ReturnStmt has storage for an NRVO candidate.
bool hasNRVOCandidate() const { return ReturnStmtBits.HasNRVOCandidate; }
- unsigned numTrailingObjects(OverloadToken<const VarDecl *>) const {
- return hasNRVOCandidate();
- }
-
/// Build a return statement.
ReturnStmt(SourceLocation RL, Expr *E, const VarDecl *NRVOCandidate);
@@ -3187,8 +3163,7 @@ public:
/// The optimization itself can only be performed if the variable is
/// also marked as an NRVO object.
const VarDecl *getNRVOCandidate() const {
- return hasNRVOCandidate() ? *getTrailingObjects<const VarDecl *>()
- : nullptr;
+ return hasNRVOCandidate() ? *getTrailingObjects() : nullptr;
}
/// Set the variable that might be used for the named return value
@@ -3197,7 +3172,7 @@ public:
void setNRVOCandidate(const VarDecl *Var) {
assert(hasNRVOCandidate() &&
"This return statement has no storage for an NRVO candidate!");
- *getTrailingObjects<const VarDecl *>() = Var;
+ *getTrailingObjects() = Var;
}
SourceLocation getReturnLoc() const { return ReturnStmtBits.RetLoc; }
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 0a9919f..ea3c43f 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -4433,7 +4433,14 @@ destroy the object before returning. The lifetime of the copy of the parameter
in the caller ends without a destructor call when the call begins.
If a type is trivial for the purpose of calls, it is assumed to be trivially
-relocatable for the purpose of ``__is_trivially_relocatable``.
+relocatable for the purpose of ``__is_trivially_relocatable`` and
+``__builtin_is_cpp_trivially_relocatable``.
+When a type marked with ``[[trivial_abi]]`` is used as a function argument,
+the compiler may omit the call to the copy constructor.
+Thus, side effects of the copy constructor may not be performed.
+For example, objects that contain pointers to themselves or otherwise depend
+on their address (or the addresses of their subobjects) should not be declared
+``[[trivial_abi]]``.
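+
+For illustration, a self-referential type like the following hypothetical one
+(not taken from the attribute's specification) should not be marked
+``trivial_abi``, because the copy constructor that re-points ``Self`` may be
+omitted when an object is passed by value:
+
+.. code-block:: c++
+
+  struct __attribute__((trivial_abi)) SelfReferential {
+    SelfReferential() : Self(this) {}
+    SelfReferential(const SelfReferential &) : Self(this) {}
+    SelfReferential *Self; // intended invariant: points at the owning object
+  };
+
+  // If the copy into the parameter is omitted, S.Self may point at the
+  // caller's original object rather than at S itself.
+  void consume(SelfReferential S);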
Attribute ``trivial_abi`` has no effect in the following cases:
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5f44d50..1f283b7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1763,27 +1763,39 @@ def err_user_defined_msg_constexpr : Error<
"constant expression">;
// Type traits explanations
-def note_unsatisfied_trait : Note<"%0 is not %enum_select<TraitName>{"
- "%TriviallyRelocatable{trivially relocatable}|"
- "%TriviallyCopyable{trivially copyable}"
- "}1">;
+def note_unsatisfied_trait
+ : Note<"%0 is not %enum_select<TraitName>{"
+ "%TriviallyRelocatable{trivially relocatable}|"
+ "%Replaceable{replaceable}|"
+ "%TriviallyCopyable{trivially copyable}"
+ "}1">;
def note_unsatisfied_trait_reason
: Note<"because it "
"%enum_select<TraitNotSatisfiedReason>{"
"%Ref{is a reference type}|"
+ "%Const{is const}|"
+ "%Volatile{is volatile}|"
"%HasArcLifetime{has an ARC lifetime qualifier}|"
"%VLA{is a variably-modified type}|"
"%VBase{has a virtual base %1}|"
+ "%NotScalarOrClass{not %select{a|an array of objects of}1 scalar or "
+ "class type}|"
"%NTRBase{has a non-trivially-relocatable base %1}|"
"%NTRField{has a non-trivially-relocatable member %1 of type %2}|"
+ "%NonReplaceableBase{has a non-replaceable base %1}|"
+ "%NonReplaceableField{has a non-replaceable member %1 of type %2}|"
"%NTCBase{has a non-trivially-copyable base %1}|"
"%NTCField{has a non-trivially-copyable member %1 of type %2}|"
"%DeletedDtr{has a %select{deleted|user-provided}1 destructor}|"
"%UserProvidedCtr{has a user provided %select{copy|move}1 "
"constructor}|"
+ "%DeletedCtr{has a deleted %select{copy|move}1 "
+ "constructor}|"
"%UserProvidedAssign{has a user provided %select{copy|move}1 "
"assignment operator}|"
+ "%DeletedAssign{has a deleted %select{copy|move}1 "
+ "assignment operator}|"
"%UnionWithUserDeclaredSMF{is a union with a user-declared "
"%sub{select_special_member_kind}1}"
"}0">;
@@ -9420,6 +9432,8 @@ def err_cuda_host_shared : Error<
"%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
def err_cuda_nonstatic_constdev: Error<"__constant__, __device__, and "
"__managed__ are not allowed on non-static local variables">;
+def err_cuda_address_space_gpuvar: Error<"__constant__, __device__, and "
+ "__shared__ variables must use default address space">;
def err_cuda_grid_constant_not_allowed : Error<
"__grid_constant__ is only allowed on const-qualified kernel parameters">;
def err_cuda_ovl_target : Error<
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 7a5f192..a3754f4 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -278,6 +278,11 @@ public:
return createCast(loc, cir::CastKind::bitcast, src, newTy);
}
+ mlir::Value createPtrBitcast(mlir::Value src, mlir::Type newPointeeTy) {
+ assert(mlir::isa<cir::PointerType>(src.getType()) && "expected ptr src");
+ return createBitcast(src, getPointerTo(newPointeeTy));
+ }
+
//===--------------------------------------------------------------------===//
// Binary Operators
//===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 038a59b..8579f70 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2275,4 +2275,47 @@ def VecTernaryOp : CIR_Op<"vec.ternary",
let hasFolder = 1;
}
+//===----------------------------------------------------------------------===//
+// BaseClassAddrOp
+//===----------------------------------------------------------------------===//
+
+def BaseClassAddrOp : CIR_Op<"base_class_addr"> {
+ let summary = "Get the base class address for a class/struct";
+ let description = [{
+    The `cir.base_class_addr` operation gets the address of a particular
+    non-virtual base class given a derived class pointer. The offset in bytes
+    of the base class must be passed in, since it is easier for the front end
+    to calculate it than for the MLIR passes to do so. The operation carries a
+    flag indicating whether the operand may be null. That depends on the
+    context and cannot be known by the operation itself, and it affects how
+    the operation is lowered.
+
+ Example:
+ ```c++
+ struct Base { };
+ struct Derived : Base { };
+ Derived d;
+ Base& b = d;
+ ```
+ will generate
+ ```mlir
+ %3 = cir.base_class_addr %1 : !cir.ptr<!rec_Derived> nonnull [0] -> !cir.ptr<!rec_Base>
+ ```
+ }];
+
+  // The validity of the derived-to-base relationship cannot yet be verified,
+  // so adding a verifier is currently not worthwhile.
+ let arguments = (ins
+ Arg<CIR_PointerType, "derived class pointer", [MemRead]>:$derived_addr,
+ IndexAttr:$offset, UnitAttr:$assume_not_null);
+
+ let results = (outs Res<CIR_PointerType, "">:$base_addr);
+
+ let assemblyFormat = [{
+ $derived_addr `:` qualified(type($derived_addr))
+ (`nonnull` $assume_not_null^)?
+ ` ` `[` $offset `]` `->` qualified(type($base_addr)) attr-dict
+ }];
+}
+
#endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 7fcb6cb..72d882b 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -151,6 +151,11 @@ struct MissingFeatures {
static bool cxxabiAppleARM64CXXABI() { return false; }
static bool cxxabiStructorImplicitParam() { return false; }
+ // Address class
+ static bool addressOffset() { return false; }
+ static bool addressIsKnownNonNull() { return false; }
+ static bool addressPointerAuthInfo() { return false; }
+
// Misc
static bool cirgenABIInfo() { return false; }
static bool abiArgInfo() { return false; }
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index be4ba68..0b0289c 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -1154,12 +1154,12 @@ void SwitchStmt::setConditionVariable(const ASTContext &Ctx, VarDecl *V) {
"This switch statement has no storage for a condition variable!");
if (!V) {
- getTrailingObjects<Stmt *>()[varOffset()] = nullptr;
+ getTrailingObjects()[varOffset()] = nullptr;
return;
}
SourceRange VarRange = V->getSourceRange();
- getTrailingObjects<Stmt *>()[varOffset()] = new (Ctx)
+ getTrailingObjects()[varOffset()] = new (Ctx)
DeclStmt(DeclGroupRef(V), VarRange.getBegin(), VarRange.getEnd());
}
@@ -1215,12 +1215,12 @@ void WhileStmt::setConditionVariable(const ASTContext &Ctx, VarDecl *V) {
"This while statement has no storage for a condition variable!");
if (!V) {
- getTrailingObjects<Stmt *>()[varOffset()] = nullptr;
+ getTrailingObjects()[varOffset()] = nullptr;
return;
}
SourceRange VarRange = V->getSourceRange();
- getTrailingObjects<Stmt *>()[varOffset()] = new (Ctx)
+ getTrailingObjects()[varOffset()] = new (Ctx)
DeclStmt(DeclGroupRef(V), VarRange.getBegin(), VarRange.getEnd());
}
diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp
index 7f37a8b..cf75959 100644
--- a/clang/lib/Analysis/CFG.cpp
+++ b/clang/lib/Analysis/CFG.cpp
@@ -1261,6 +1261,28 @@ private:
L2Result.Val.getKind() == APValue::Float) {
llvm::APFloat L1 = L1Result.Val.getFloat();
llvm::APFloat L2 = L2Result.Val.getFloat();
+      // Note that L1 and L2 do not necessarily have the same type. For
+      // example, in `x != 0 || x != 1.0`, if `x` is a float16, the two
+      // literals `0` and `1.0` are float16 and double respectively. In this
+      // case, we must convert one before comparing L1 and L2. Their types
+      // must be compatible, since both are compared against the same DRE.
+ int Order = Context->getFloatingTypeSemanticOrder(NumExpr1->getType(),
+ NumExpr2->getType());
+ bool Ignored = false;
+
+ if (Order > 0) {
+ // type rank L1 > L2:
+ if (llvm::APFloat::opOK !=
+ L2.convert(L1.getSemantics(), llvm::APFloat::rmNearestTiesToEven,
+ &Ignored))
+ return {};
+    } else if (Order < 0) {
+      // type rank L1 < L2:
+      if (llvm::APFloat::opOK !=
+          L1.convert(L2.getSemantics(), llvm::APFloat::rmNearestTiesToEven,
+                     &Ignored))
+        return {};
+    }
+
llvm::APFloat MidValue = L1;
MidValue.add(L2, llvm::APFloat::rmNearestTiesToEven);
MidValue.divide(llvm::APFloat(MidValue.getSemantics(), "2.0"),
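
For reference, the conversion step above in isolation, as a standalone sketch against the llvm::APFloat API (editorial; it mirrors the new code rather than being part of it):

```c++
#include "llvm/ADT/APFloat.h"

// Sketch: bring a float16 value and a double value to common semantics
// before arithmetic, as the CFG change above does for mismatched literals.
static bool midpointIsHalf() {
  llvm::APFloat L1(llvm::APFloat::IEEEhalf(), "0");     // like `x != 0`
  llvm::APFloat L2(llvm::APFloat::IEEEdouble(), "1.0"); // like `x != 1.0`

  bool Ignored = false;
  // L1 has the lower-ranked semantics, so convert it up to L2's.
  if (L1.convert(L2.getSemantics(), llvm::APFloat::rmNearestTiesToEven,
                 &Ignored) != llvm::APFloat::opOK)
    return false;

  llvm::APFloat Mid = L1;
  Mid.add(L2, llvm::APFloat::rmNearestTiesToEven);
  Mid.divide(llvm::APFloat(Mid.getSemantics(), "2.0"),
             llvm::APFloat::rmNearestTiesToEven);
  return Mid.convertToDouble() == 0.5; // true: (0 + 1.0) / 2
}
```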
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 11636fa..3ad5abc 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -49,6 +49,9 @@ public:
HasFeatureLD_SEQ_SA = false;
HasFeatureDiv32 = false;
HasFeatureSCQ = false;
+ BFloat16Width = 16;
+ BFloat16Align = 16;
+ BFloat16Format = &llvm::APFloat::BFloat();
LongDoubleWidth = 128;
LongDoubleAlign = 128;
LongDoubleFormat = &llvm::APFloat::IEEEquad();
@@ -99,6 +102,8 @@ public:
bool hasBitIntType() const override { return true; }
+ bool hasBFloat16Type() const override { return true; }
+
bool useFP16ConversionIntrinsics() const override { return false; }
bool handleTargetFeatures(std::vector<std::string> &Features,
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 3d58be8..6f8a236 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -647,6 +647,7 @@ public:
CygwinX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: X86_32TargetInfo(Triple, Opts) {
this->WCharType = TargetInfo::UnsignedShort;
+ this->WIntType = TargetInfo::UnsignedInt;
DoubleAlign = LongLongAlign = 64;
resetDataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-"
"i128:128-f80:32-n8:16:32-a:0:32-S32",
@@ -982,6 +983,7 @@ public:
CygwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
: X86_64TargetInfo(Triple, Opts) {
this->WCharType = TargetInfo::UnsignedShort;
+ this->WIntType = TargetInfo::UnsignedInt;
}
void getTargetDefines(const LangOptions &Opts,
diff --git a/clang/lib/CIR/CodeGen/Address.h b/clang/lib/CIR/CodeGen/Address.h
index 2cc8ada..6f76c3e 100644
--- a/clang/lib/CIR/CodeGen/Address.h
+++ b/clang/lib/CIR/CodeGen/Address.h
@@ -21,6 +21,9 @@
namespace clang::CIRGen {
+// Forward declaration to avoid a circular dependency
+class CIRGenBuilderTy;
+
class Address {
// The boolean flag indicates whether the pointer is known to be non-null.
@@ -65,11 +68,22 @@ public:
return pointerAndKnownNonNull.getPointer() != nullptr;
}
+  /// Return an address with a different element type, a bitcast pointer,
+  /// and the same alignment.
+ Address withElementType(CIRGenBuilderTy &builder, mlir::Type ElemTy) const;
+
mlir::Value getPointer() const {
assert(isValid());
return pointerAndKnownNonNull.getPointer();
}
+ mlir::Value getBasePointer() const {
+    // TODO(cir): Remove the version above when we catch up with OG codegen
+    // on ptr auth.
+ assert(isValid() && "pointer isn't valid");
+ return getPointer();
+ }
+
mlir::Type getType() const {
assert(mlir::cast<cir::PointerType>(
pointerAndKnownNonNull.getPointer().getType())
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
index 5620821..4c8c6ed 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
@@ -38,3 +38,15 @@ mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin,
const mlir::Type flatPtrTy = basePtr.getType();
return create<cir::PtrStrideOp>(arrayLocEnd, flatPtrTy, basePtr, idx);
}
+
+// This can't be defined in Address.h because that file is included by
+// CIRGenBuilder.h
+Address Address::withElementType(CIRGenBuilderTy &builder,
+ mlir::Type elemTy) const {
+ assert(!cir::MissingFeatures::addressOffset());
+ assert(!cir::MissingFeatures::addressIsKnownNonNull());
+ assert(!cir::MissingFeatures::addressPointerAuthInfo());
+
+ return Address(builder.createPtrBitcast(getBasePointer(), elemTy), elemTy,
+ getAlignment());
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 5f33ae1..03077ee 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -309,6 +309,18 @@ public:
return create<cir::BinOp>(loc, cir::BinOpKind::Div, lhs, rhs);
}
+ Address createBaseClassAddr(mlir::Location loc, Address addr,
+ mlir::Type destType, unsigned offset,
+ bool assumeNotNull) {
+ if (destType == addr.getElementType())
+ return addr;
+
+ auto ptrTy = getPointerTo(destType);
+ auto baseAddr = create<cir::BaseClassAddrOp>(
+ loc, ptrTy, addr.getPointer(), mlir::APInt(64, offset), assumeNotNull);
+ return Address(baseAddr, destType, addr.getAlignment());
+ }
+
cir::LoadOp createLoad(mlir::Location loc, Address addr,
bool isVolatile = false) {
mlir::IntegerAttr align = getAlignmentAttr(addr.getAlignment());
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
new file mode 100644
index 0000000..4cdaa480
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code dealing with C++ code generation of classes
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenFunction.h"
+
+#include "clang/AST/RecordLayout.h"
+#include "clang/CIR/MissingFeatures.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+Address CIRGenFunction::getAddressOfBaseClass(
+ Address value, const CXXRecordDecl *derived,
+ llvm::iterator_range<CastExpr::path_const_iterator> path,
+ bool nullCheckValue, SourceLocation loc) {
+ assert(!path.empty() && "Base path should not be empty!");
+
+ if ((*path.begin())->isVirtual()) {
+ // The implementation here is actually complete, but let's flag this
+ // as an error until the rest of the virtual base class support is in place.
+ cgm.errorNYI(loc, "getAddrOfBaseClass: virtual base");
+ return Address::invalid();
+ }
+
+ // Compute the static offset of the ultimate destination within its
+ // allocating subobject (the virtual base, if there is one, or else
+ // the "complete" object that we see).
+ CharUnits nonVirtualOffset =
+ cgm.computeNonVirtualBaseClassOffset(derived, path);
+
+ // Get the base pointer type.
+ mlir::Type baseValueTy = convertType((path.end()[-1])->getType());
+ assert(!cir::MissingFeatures::addressSpace());
+
+ // The if statement here is redundant now, but it will be needed when we add
+ // support for virtual base classes.
+ // If there is no virtual base, use cir.base_class_addr. It takes care of
+ // the adjustment and the null pointer check.
+ if (nonVirtualOffset.isZero()) {
+ assert(!cir::MissingFeatures::sanitizers());
+ return builder.createBaseClassAddr(getLoc(loc), value, baseValueTy, 0,
+ /*assumeNotNull=*/true);
+ }
+
+ assert(!cir::MissingFeatures::sanitizers());
+
+ // Apply the offset
+ value = builder.createBaseClassAddr(getLoc(loc), value, baseValueTy,
+ nonVirtualOffset.getQuantity(),
+ /*assumeNotNull=*/true);
+
+ // Cast to the destination type.
+ value = value.withElementType(builder, baseValueTy);
+
+ return value;
+}
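
A source-level view of what getAddressOfBaseClass computes (illustrative types; offsets assume typical Itanium layout):

```c++
// Illustrative sketch: B is laid out after A inside C, so converting C* to
// B* must add a nonzero byte offset. That offset is what
// computeNonVirtualBaseClassOffset produces and cir.base_class_addr encodes.
struct A { int a; };
struct B { int b; };
struct C : A, B {};

B *toBase(C *c) {
  return c; // derived-to-base: pointer adjusted by sizeof(A) bytes here
}
```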
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 1175fdc..8129fe0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "Address.h"
+#include "CIRGenConstantEmitter.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "CIRGenValue.h"
@@ -97,9 +98,14 @@ Address CIRGenFunction::emitPointerWithAlignment(const Expr *expr,
case CK_UncheckedDerivedToBase:
case CK_DerivedToBase: {
- cgm.errorNYI(expr->getSourceRange(),
- "emitPointerWithAlignment: derived-to-base cast");
- return Address::invalid();
+ assert(!cir::MissingFeatures::opTBAA());
+ assert(!cir::MissingFeatures::addressIsKnownNonNull());
+ Address addr = emitPointerWithAlignment(ce->getSubExpr(), baseInfo);
+ const CXXRecordDecl *derived =
+ ce->getSubExpr()->getType()->getPointeeCXXRecordDecl();
+ return getAddressOfBaseClass(addr, derived, ce->path(),
+ shouldNullCheckClassCastValue(ce),
+ ce->getExprLoc());
}
case CK_AnyPointerToBlockPointerCast:
@@ -823,8 +829,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
case CK_NonAtomicToAtomic:
case CK_AtomicToNonAtomic:
case CK_Dynamic:
- case CK_UncheckedDerivedToBase:
- case CK_DerivedToBase:
case CK_ToUnion:
case CK_BaseToDerived:
case CK_LValueBitCast:
@@ -863,6 +867,27 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
return lv;
}
+ case CK_UncheckedDerivedToBase:
+ case CK_DerivedToBase: {
+ const auto *derivedClassTy =
+ e->getSubExpr()->getType()->castAs<clang::RecordType>();
+ auto *derivedClassDecl = cast<CXXRecordDecl>(derivedClassTy->getDecl());
+
+ LValue lv = emitLValue(e->getSubExpr());
+ Address thisAddr = lv.getAddress();
+
+ // Perform the derived-to-base conversion
+ Address baseAddr =
+ getAddressOfBaseClass(thisAddr, derivedClassDecl, e->path(),
+ /*NullCheckValue=*/false, e->getExprLoc());
+
+ // TODO: Support accesses to members of base classes in TBAA. For now, we
+ // conservatively pretend that the complete object is of the base class
+ // type.
+ assert(!cir::MissingFeatures::opTBAA());
+ return makeAddrLValue(baseAddr, e->getType(), lv.getBaseInfo());
+ }
+
case CK_ZeroToOCLOpaqueType:
llvm_unreachable("NULL to OpenCL opaque type lvalue cast is not valid");
}
@@ -1495,3 +1520,57 @@ cir::AllocaOp CIRGenFunction::createTempAlloca(mlir::Type ty,
emitAlloca(name.str(), ty, loc, CharUnits(), ip, arraySize)
.getDefiningOp());
}
+
+/// Try to emit a reference to the given value without producing it as
+/// an l-value. For many cases, this is just an optimization, but it avoids
+/// us needing to emit global copies of variables if they're named without
+/// triggering a formal use in a context where we can't emit a direct
+/// reference to them, for instance if a block or lambda or a member of a
+/// local class uses a const int variable or constexpr variable from an
+/// enclosing function.
+///
+/// For named members of enums, this is the only way they are emitted.
+CIRGenFunction::ConstantEmission
+CIRGenFunction::tryEmitAsConstant(DeclRefExpr *refExpr) {
+ ValueDecl *value = refExpr->getDecl();
+
+ // There is a lot more to do here, but for now only EnumConstantDecl is
+ // supported.
+ assert(!cir::MissingFeatures::tryEmitAsConstant());
+
+ // The value needs to be an enum constant or a constant variable.
+ if (!isa<EnumConstantDecl>(value))
+ return ConstantEmission();
+
+ Expr::EvalResult result;
+ if (!refExpr->EvaluateAsRValue(result, getContext()))
+ return ConstantEmission();
+
+ QualType resultType = refExpr->getType();
+
+ // As long as we're only handling EnumConstantDecl, there should be no
+ // side-effects.
+ assert(!result.HasSideEffects);
+
+ // Emit as a constant.
+ // FIXME(cir): have emitAbstract build a TypedAttr instead (this requires
+ // somewhat heavy refactoring...)
+ mlir::Attribute c = ConstantEmitter(*this).emitAbstract(
+ refExpr->getLocation(), result.Val, resultType);
+ mlir::TypedAttr cstToEmit = mlir::dyn_cast_if_present<mlir::TypedAttr>(c);
+ assert(cstToEmit && "expected a typed attribute");
+
+ assert(!cir::MissingFeatures::generateDebugInfo());
+
+ return ConstantEmission::forValue(cstToEmit);
+}
+
+mlir::Value CIRGenFunction::emitScalarConstant(
+ const CIRGenFunction::ConstantEmission &constant, Expr *e) {
+ assert(constant && "not a constant");
+ if (constant.isReference()) {
+ cgm.errorNYI(e->getSourceRange(), "emitScalarConstant: reference");
+ return {};
+ }
+ return builder.getConstant(getLoc(e->getSourceRange()), constant.getValue());
+}
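
An illustrative case of what the new tryEmitAsConstant path handles: a DeclRefExpr that names an EnumConstantDecl now folds to a constant instead of going through emitLoadOfLValue:

```c++
// Illustrative sketch: `Green` below is a DeclRefExpr to an
// EnumConstantDecl, so scalar emission produces a constant (conceptually,
// cir.const #cir.int<2>) rather than materializing and loading an l-value.
enum Color { Red = 1, Green = 2 };

int useGreen() { return Green; }
```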
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 899622d..481eb49 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -140,7 +140,9 @@ public:
// l-values
mlir::Value VisitDeclRefExpr(DeclRefExpr *e) {
- assert(!cir::MissingFeatures::tryEmitAsConstant());
+ if (CIRGenFunction::ConstantEmission constant = cgf.tryEmitAsConstant(e))
+ return cgf.emitScalarConstant(constant, e);
+
return emitLoadOfLValue(e);
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index b008ee9..e32a5c8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -16,6 +16,7 @@
#include "CIRGenCall.h"
#include "CIRGenValue.h"
#include "mlir/IR/Location.h"
+#include "clang/AST/ExprCXX.h"
#include "clang/AST/GlobalDecl.h"
#include "clang/CIR/MissingFeatures.h"
@@ -629,4 +630,25 @@ void CIRGenFunction::emitNullInitialization(mlir::Location loc, Address destPtr,
builder.createStore(loc, zeroValue, destPtr);
}
+// TODO(cir): should be shared with LLVM codegen.
+bool CIRGenFunction::shouldNullCheckClassCastValue(const CastExpr *ce) {
+ const Expr *e = ce->getSubExpr();
+
+ if (ce->getCastKind() == CK_UncheckedDerivedToBase)
+ return false;
+
+ if (isa<CXXThisExpr>(e->IgnoreParens())) {
+ // We always assume that 'this' is never null.
+ return false;
+ }
+
+ if (const ImplicitCastExpr *ice = dyn_cast<ImplicitCastExpr>(ce)) {
+ // And that glvalue casts are never null.
+ if (ice->isGLValue())
+ return false;
+ }
+
+ return true;
+}
+
} // namespace clang::CIRGen
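
To illustrate when shouldNullCheckClassCastValue fires (illustrative types): a possibly-null derived pointer must map to a null base pointer, while casts from `this` and glvalue casts never need the guard:

```c++
struct A { int a; };
struct B { int b; };
struct D : A, B {};

// d may be null; B sits at a nonzero offset inside D, so the pointer
// adjustment has to be guarded by a null check (null must stay null).
B *convert(D *d) { return d; }

struct E : A, B {
  // 'this' is assumed non-null (CK_UncheckedDerivedToBase), so the same
  // conversion from 'this' is emitted without the check.
  B *self() { return this; }
};
```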
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index ee014ad..d6002c3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -374,6 +374,41 @@ public:
/// that we can just remove the code.
bool containsLabel(const clang::Stmt *s, bool ignoreCaseStmts = false);
+ class ConstantEmission {
+    // Cannot use mlir::TypedAttr directly here because PointerIntPair needs
+    // a spare low bit, which TypedAttr does not guarantee.
+ llvm::PointerIntPair<mlir::Attribute, 1, bool> valueAndIsReference;
+ ConstantEmission(mlir::TypedAttr c, bool isReference)
+ : valueAndIsReference(c, isReference) {}
+
+ public:
+ ConstantEmission() {}
+ static ConstantEmission forReference(mlir::TypedAttr c) {
+ return ConstantEmission(c, true);
+ }
+ static ConstantEmission forValue(mlir::TypedAttr c) {
+ return ConstantEmission(c, false);
+ }
+
+ explicit operator bool() const {
+ return valueAndIsReference.getOpaqueValue() != nullptr;
+ }
+
+ bool isReference() const { return valueAndIsReference.getInt(); }
+ LValue getReferenceLValue(CIRGenFunction &cgf, Expr *refExpr) const {
+ assert(isReference());
+ cgf.cgm.errorNYI(refExpr->getSourceRange(),
+ "ConstantEmission::getReferenceLValue");
+ return {};
+ }
+
+ mlir::TypedAttr getValue() const {
+ assert(!isReference());
+ return mlir::cast<mlir::TypedAttr>(valueAndIsReference.getPointer());
+ }
+ };
+
+ ConstantEmission tryEmitAsConstant(DeclRefExpr *refExpr);
+
struct AutoVarEmission {
const clang::VarDecl *Variable;
/// The address of the alloca for languages with explicit address space
@@ -430,6 +465,8 @@ public:
// TODO: Add symbol table support
}
+ bool shouldNullCheckClassCastValue(const CastExpr *ce);
+
LValue makeNaturalAlignPointeeAddrLValue(mlir::Value v, clang::QualType t);
/// Construct an address with the natural alignment of T. If a pointer to T
@@ -445,6 +482,11 @@ public:
return Address(ptr, convertTypeForMem(t), alignment);
}
+ Address getAddressOfBaseClass(
+ Address value, const CXXRecordDecl *derived,
+ llvm::iterator_range<CastExpr::path_const_iterator> path,
+ bool nullCheckValue, SourceLocation loc);
+
LValue makeAddrLValue(Address addr, QualType ty,
AlignmentSource source = AlignmentSource::Type) {
return makeAddrLValue(addr, ty, LValueBaseInfo(source));
@@ -840,6 +882,8 @@ public:
mlir::LogicalResult emitReturnStmt(const clang::ReturnStmt &s);
+ mlir::Value emitScalarConstant(const ConstantEmission &constant, Expr *e);
+
/// Emit a conversion from the specified type to the specified destination
/// type, both of which are CIR scalar types.
mlir::Value emitScalarConversion(mlir::Value src, clang::QualType srcType,
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h b/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h
index 9886574..4f5754c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h
@@ -76,10 +76,8 @@ class CIRGenFunctionInfo final
unsigned numArgs;
- CanQualType *getArgTypes() { return getTrailingObjects<CanQualType>(); }
- const CanQualType *getArgTypes() const {
- return getTrailingObjects<CanQualType>();
- }
+ CanQualType *getArgTypes() { return getTrailingObjects(); }
+ const CanQualType *getArgTypes() const { return getTrailingObjects(); }
CIRGenFunctionInfo() : required(RequiredArgs::All) {}
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index e5eae31..3d46c44 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -19,6 +19,7 @@
#include "clang/AST/DeclBase.h"
#include "clang/AST/DeclOpenACC.h"
#include "clang/AST/GlobalDecl.h"
+#include "clang/AST/RecordLayout.h"
#include "clang/Basic/SourceManager.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/Interfaces/CIROpInterfaces.h"
@@ -1683,6 +1684,33 @@ bool CIRGenModule::verifyModule() const {
return mlir::verify(theModule).succeeded();
}
+// TODO(cir): this can be shared with LLVM codegen.
+CharUnits CIRGenModule::computeNonVirtualBaseClassOffset(
+ const CXXRecordDecl *derivedClass,
+ llvm::iterator_range<CastExpr::path_const_iterator> path) {
+ CharUnits offset = CharUnits::Zero();
+
+ const ASTContext &astContext = getASTContext();
+ const CXXRecordDecl *rd = derivedClass;
+
+ for (const CXXBaseSpecifier *base : path) {
+ assert(!base->isVirtual() && "Should not see virtual bases here!");
+
+ // Get the layout.
+ const ASTRecordLayout &layout = astContext.getASTRecordLayout(rd);
+
+ const auto *baseDecl = cast<CXXRecordDecl>(
+ base->getType()->castAs<clang::RecordType>()->getDecl());
+
+ // Add the offset.
+ offset += layout.getBaseClassOffset(baseDecl);
+
+ rd = baseDecl;
+ }
+
+ return offset;
+}
+
DiagnosticBuilder CIRGenModule::errorNYI(SourceLocation loc,
llvm::StringRef feature) {
unsigned diagID = diags.getCustomDiagID(
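
The path walk in computeNonVirtualBaseClassOffset simply accumulates one base offset per step. An illustrative multi-level case (typical layout, no virtual functions):

```c++
// Illustrative sketch: for the conversion D* -> A*, the CastExpr path is
// [B-in-D, A-in-B], and the function returns
//   offset(B within D) + offset(A within B).
struct A { int a; };
struct X { int x; };
struct B : X, A {}; // A placed after X inside B
struct Y { int y; };
struct D : Y, B {}; // B placed after Y inside D

A *upcast(D *d) { return d; } // total adjustment: sizeof(Y) + sizeof(X)
```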
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index ac25d52..24ec9ca 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -141,6 +141,10 @@ public:
getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty = {},
ForDefinition_t isForDefinition = NotForDefinition);
+ CharUnits computeNonVirtualBaseClassOffset(
+ const CXXRecordDecl *derivedClass,
+ llvm::iterator_range<CastExpr::path_const_iterator> path);
+
/// Return a constant array for the given string.
mlir::Attribute getConstantArrayFromStringLiteral(const StringLiteral *e);
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 185a0e1..8bfcd27 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -10,6 +10,7 @@ add_clang_library(clangCIR
CIRGenerator.cpp
CIRGenBuilder.cpp
CIRGenCall.cpp
+ CIRGenClass.cpp
CIRGenCXXABI.cpp
CIRGenCXXExpr.cpp
CIRGenDecl.cpp
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 31ff207..3417a3d 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -671,6 +671,38 @@ mlir::LogicalResult CIRToLLVMPtrStrideOpLowering::matchAndRewrite(
return mlir::success();
}
+mlir::LogicalResult CIRToLLVMBaseClassAddrOpLowering::matchAndRewrite(
+ cir::BaseClassAddrOp baseClassOp, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ const mlir::Type resultType =
+ getTypeConverter()->convertType(baseClassOp.getType());
+ mlir::Value derivedAddr = adaptor.getDerivedAddr();
+ llvm::SmallVector<mlir::LLVM::GEPArg, 1> offset = {
+ adaptor.getOffset().getZExtValue()};
+ mlir::Type byteType = mlir::IntegerType::get(resultType.getContext(), 8,
+ mlir::IntegerType::Signless);
+ if (adaptor.getOffset().getZExtValue() == 0) {
+ rewriter.replaceOpWithNewOp<mlir::LLVM::BitcastOp>(
+ baseClassOp, resultType, adaptor.getDerivedAddr());
+ return mlir::success();
+ }
+
+ if (baseClassOp.getAssumeNotNull()) {
+ rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(
+ baseClassOp, resultType, byteType, derivedAddr, offset);
+ } else {
+ auto loc = baseClassOp.getLoc();
+ mlir::Value isNull = rewriter.create<mlir::LLVM::ICmpOp>(
+ loc, mlir::LLVM::ICmpPredicate::eq, derivedAddr,
+ rewriter.create<mlir::LLVM::ZeroOp>(loc, derivedAddr.getType()));
+ mlir::Value adjusted = rewriter.create<mlir::LLVM::GEPOp>(
+ loc, resultType, byteType, derivedAddr, offset);
+ rewriter.replaceOpWithNewOp<mlir::LLVM::SelectOp>(baseClassOp, isNull,
+ derivedAddr, adjusted);
+ }
+ return mlir::success();
+}
+
mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
cir::AllocaOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
@@ -1750,6 +1782,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
dl);
patterns.add<
// clang-format off
+ CIRToLLVMBaseClassAddrOpLowering,
CIRToLLVMBinOpLowering,
CIRToLLVMBrCondOpLowering,
CIRToLLVMBrOpLowering,
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index ad2334b..22d8a1e 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -297,6 +297,16 @@ public:
mlir::ConversionPatternRewriter &) const override;
};
+class CIRToLLVMBaseClassAddrOpLowering
+ : public mlir::OpConversionPattern<cir::BaseClassAddrOp> {
+public:
+ using mlir::OpConversionPattern<cir::BaseClassAddrOp>::OpConversionPattern;
+
+ mlir::LogicalResult
+ matchAndRewrite(cir::BaseClassAddrOp op, OpAdaptor,
+ mlir::ConversionPatternRewriter &) const override;
+};
+
class CIRToLLVMStackSaveOpLowering
: public mlir::OpConversionPattern<cir::StackSaveOp> {
public:
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dd26be7..38f5143 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
- (CGM.getLangOpts().OffloadingNewDriver &&
- (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
+ (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 73ff7757..80728da 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4424,10 +4424,6 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));
- bool HIPNoRDC =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
- !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
@@ -4561,7 +4557,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
- } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
+ } else if (UseNewOffloadingDriver ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4872,28 +4868,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
const InputTy &Input, StringRef CUID,
Action *HostAction) const {
// Don't build offloading actions if explicitly disabled or we do not have a
- // valid source input.
- if (offloadHostOnly() || !types::isSrcFile(Input.first))
- return HostAction;
-
- bool HIPNoRDC =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
- !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
- // For HIP non-rdc non-device-only compilation, create a linker wrapper
- // action for each host object to link, bundle and wrap device files in
- // it.
- if (isa<AssembleJobAction>(HostAction) && HIPNoRDC && !offloadDeviceOnly()) {
- ActionList AL{HostAction};
- HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
- HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
- /*BoundArch=*/nullptr);
- return HostAction;
- }
-
- // Don't build offloading actions if we do not have a compile action. If
- // preprocessing only ignore embedding.
- if (!(isa<CompileJobAction>(HostAction) ||
+  // valid source input and compile action to embed it in. If only
+  // preprocessing, ignore embedding.
+ if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
+ !(isa<CompileJobAction>(HostAction) ||
getFinalPhase(Args) == phases::Preprocess))
return HostAction;
@@ -4989,12 +4967,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // Compiling HIP in device-only non-RDC mode requires linking each action
- // individually.
+ // Compiling HIP in non-RDC mode requires linking each action individually.
for (Action *&A : DeviceActions) {
if ((A->getType() != types::TY_Object &&
A->getType() != types::TY_LTO_BC) ||
- !HIPNoRDC || !offloadDeviceOnly())
+ Kind != Action::OFK_HIP ||
+ Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
continue;
ActionList LinkerInput = {A};
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -5018,12 +4996,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // HIP code in device-only non-RDC mode will bundle the output if it invoked
- // the linker.
+ // HIP code in non-RDC mode will bundle the output if it invoked the linker.
bool ShouldBundleHIP =
- HIPNoRDC && offloadDeviceOnly() &&
+ C.isOffloadingHostKind(Action::OFK_HIP) &&
Args.hasFlag(options::OPT_gpu_bundle_output,
options::OPT_no_gpu_bundle_output, true) &&
+ !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
!llvm::any_of(OffloadActions,
[](Action *A) { return A->getType() != types::TY_Image; });
@@ -5043,9 +5021,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
nullptr, Action::OFK_Cuda);
- } else if (HIPNoRDC && offloadDeviceOnly()) {
- // If we are in device-only non-RDC-mode we just emit the final HIP
- // fatbinary for each translation unit, linking each input individually.
+ } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
+ !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false)) {
+ // If we are not in RDC-mode we just emit the final HIP fatbinary for each
+ // translation unit, linking each input individually.
Action *FatbinAction =
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5198,11 +5178,8 @@ Action *Driver::ConstructPhaseAction(
(((Input->getOffloadingToolChain() &&
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
- ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
- false) ||
- (Args.hasFlag(options::OPT_offload_new_driver,
- options::OPT_no_offload_new_driver, false) &&
- !offloadDeviceOnly())) ||
+ (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
types::ID Output =
Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 65f101d..d85cc41 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7821,7 +7821,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
- if (IsCuda && !IsRDCMode) {
+ if ((IsCuda || IsHIP) && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9368,20 +9368,8 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// Add the linker arguments to be forwarded by the wrapper.
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
LinkCommand->getExecutable()));
-
- // We use action type to differentiate two use cases of the linker wrapper.
- // TY_Image for normal linker wrapper work.
- // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
- // object.
- assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
- if (JA.getType() == types::TY_Object) {
- CmdArgs.append({"-o", Output.getFilename()});
- for (auto Input : Inputs)
- CmdArgs.push_back(Input.getFilename());
- CmdArgs.push_back("-r");
- } else
- for (const char *LinkArg : LinkCommand->getArguments())
- CmdArgs.push_back(LinkArg);
+ for (const char *LinkArg : LinkCommand->getArguments())
+ CmdArgs.push_back(LinkArg);
addOffloadCompressArgs(Args, CmdArgs);
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 77d857b..59f423b 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1642,14 +1642,16 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args,
CmdArgs.push_back("-lSystem");
// Select the dynamic runtime library and the target specific static library.
- if (isTargetIOSBased()) {
- // If we are compiling as iOS / simulator, don't attempt to link libgcc_s.1,
- // it never went into the SDK.
- // Linking against libgcc_s.1 isn't needed for iOS 5.0+
- if (isIPhoneOSVersionLT(5, 0) && !isTargetIOSSimulator() &&
- getTriple().getArch() != llvm::Triple::aarch64)
- CmdArgs.push_back("-lgcc_s.1");
- }
+  // Some old Darwin versions put the builtins, libunwind, and other runtime
+  // support functions in libgcc_s.1.dylib. Mac OS X 10.6 and iOS 5 moved
+  // those functions to libSystem and made libgcc_s.1.dylib a stub. We never
+  // link libgcc_s when building for aarch64 or the iOS simulator, since
+  // libgcc_s was made obsolete before either existed.
+ if (getTriple().getArch() != llvm::Triple::aarch64 &&
+ ((isTargetIOSBased() && isIPhoneOSVersionLT(5, 0) &&
+ !isTargetIOSSimulator()) ||
+ (isTargetMacOSBased() && isMacosxVersionLT(10, 6))))
+ CmdArgs.push_back("-lgcc_s.1");
AddLinkRuntimeLib(Args, CmdArgs, "builtins");
}
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 4e4e48f9..424b6db 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -2248,7 +2248,6 @@ unsigned ContinuationIndenter::reformatRawStringLiteral(
/*Status=*/nullptr);
auto NewCode = applyAllReplacements(RawText, Fixes.first);
- tooling::Replacements NoFixes;
if (!NewCode)
return addMultilineToken(Current, State);
if (!DryRun) {
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 4acfe0c..61b8412 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -269,7 +269,7 @@ void UnwrappedLineParser::parseFile() {
bool MustBeDeclaration = !Line->InPPDirective && !Style.isJavaScript();
ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack,
MustBeDeclaration);
- if (Style.isTextProto())
+ if (Style.isTextProto() || (Style.isJson() && FormatTok->IsFirst))
parseBracedList();
else
parseLevel();
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 5ca47ad..24cb8c3 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -321,7 +321,7 @@ void SemaCUDA::EraseUnwantedMatches(
if (Matches.size() <= 1)
return;
- using Pair = std::pair<DeclAccessPair, FunctionDecl*>;
+ using Pair = std::pair<DeclAccessPair, FunctionDecl *>;
// Gets the CUDA function preference for a call from Caller to Match.
auto GetCFP = [&](const Pair &Match) {
@@ -504,7 +504,6 @@ bool SemaCUDA::inferTargetForImplicitSpecialMember(CXXRecordDecl *ClassDecl,
}
}
-
// If no target was inferred, mark this member as __host__ __device__;
// it's the least restrictive option that can be invoked from any target.
bool NeedsH = true, NeedsD = true;
@@ -679,16 +678,22 @@ void SemaCUDA::checkAllowedInitializer(VarDecl *VD) {
FD && FD->isDependentContext())
return;
+ bool IsSharedVar = VD->hasAttr<CUDASharedAttr>();
+ bool IsDeviceOrConstantVar =
+ !IsSharedVar &&
+ (VD->hasAttr<CUDADeviceAttr>() || VD->hasAttr<CUDAConstantAttr>());
+ if ((IsSharedVar || IsDeviceOrConstantVar) &&
+ VD->getType().getQualifiers().getAddressSpace() != LangAS::Default) {
+ Diag(VD->getLocation(), diag::err_cuda_address_space_gpuvar);
+ VD->setInvalidDecl();
+ return;
+ }
// Do not check dependent variables since the ctor/dtor/initializer are not
// determined. Do it after instantiation.
if (VD->isInvalidDecl() || !VD->hasInit() || !VD->hasGlobalStorage() ||
IsDependentVar(VD))
return;
const Expr *Init = VD->getInit();
- bool IsSharedVar = VD->hasAttr<CUDASharedAttr>();
- bool IsDeviceOrConstantVar =
- !IsSharedVar &&
- (VD->hasAttr<CUDADeviceAttr>() || VD->hasAttr<CUDAConstantAttr>());
if (IsDeviceOrConstantVar || IsSharedVar) {
if (HasAllowedCUDADeviceStaticInitializer(
*this, VD, IsSharedVar ? CICK_Shared : CICK_DeviceOrConstant))
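
An illustrative CUDA snippet for the new err_cuda_address_space_gpuvar diagnostic (the explicit address_space spelling here is only for illustration):

```c++
// Illustrative sketch: device-side globals must be declared in the default
// address space; the compiler assigns the actual one during codegen.
__device__ int ok; // accepted

__device__ __attribute__((address_space(3))) int bad;
// error: __constant__, __device__, and __shared__ variables must use
// default address space
```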
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 5d870f07..da56225 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -6620,7 +6620,7 @@ void InitializationSequence::InitializeFrom(Sema &S,
// initializer present. However, we only do this for structure types, not
// union types, because an uninitialized field in a union is generally
// reasonable, especially in C where unions can be used for type punning.
- if (!Initializer && !Rec->isUnion() && !Rec->isInvalidDecl()) {
+ if (Var && !Initializer && !Rec->isUnion() && !Rec->isInvalidDecl()) {
if (const FieldDecl *FD = getConstField(Rec)) {
unsigned DiagID = diag::warn_default_init_const_field_unsafe;
if (Var->getStorageDuration() == SD_Static ||
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 66f84fc..5a19dac 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1490,7 +1490,7 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
// If the function is a class member, its signature includes the
// cv-qualifiers (if any) and ref-qualifier (if any) on the function itself.
auto DiagnoseInconsistentRefQualifiers = [&]() {
- if (SemaRef.LangOpts.CPlusPlus23)
+ if (SemaRef.LangOpts.CPlusPlus23 && !UseOverrideRules)
return false;
if (OldMethod->getRefQualifier() == NewMethod->getRefQualifier())
return false;
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 330f2aa..cd05765 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -104,6 +104,7 @@ static CXXMethodDecl *LookupSpecialMemberFromXValue(Sema &SemaRef,
OverloadCandidateSet::iterator Best;
switch (OCS.BestViableFunction(SemaRef, LookupLoc, Best)) {
case OR_Success:
+ case OR_Deleted:
return cast<CXXMethodDecl>(Best->Function);
default:
return nullptr;
@@ -120,7 +121,8 @@ static bool hasSuitableConstructorForRelocation(Sema &SemaRef,
CXXMethodDecl *Decl =
LookupSpecialMemberFromXValue(SemaRef, D, /*Assign=*/false);
- return Decl && Decl->isUserProvided() == AllowUserDefined;
+ return Decl && Decl->isUserProvided() == AllowUserDefined &&
+ !Decl->isDeleted();
}
static bool hasSuitableMoveAssignmentOperatorForRelocation(
@@ -135,7 +137,8 @@ static bool hasSuitableMoveAssignmentOperatorForRelocation(
if (!Decl)
return false;
- return Decl && Decl->isUserProvided() == AllowUserDefined;
+ return Decl && Decl->isUserProvided() == AllowUserDefined &&
+ !Decl->isDeleted();
}
// [C++26][class.prop]
@@ -1940,6 +1943,7 @@ static std::optional<TypeTrait> StdNameToTypeTrait(StringRef Name) {
return llvm::StringSwitch<std::optional<TypeTrait>>(Name)
.Case("is_trivially_relocatable",
TypeTrait::UTT_IsCppTriviallyRelocatable)
+ .Case("is_replaceable", TypeTrait::UTT_IsReplaceable)
.Case("is_trivially_copyable", TypeTrait::UTT_IsTriviallyCopyable)
.Default(std::nullopt);
}
@@ -2005,35 +2009,8 @@ static ExtractedTypeTraitInfo ExtractTypeTraitFromExpression(const Expr *E) {
return std::nullopt;
}
-static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
- SourceLocation Loc,
- const CXXRecordDecl *D) {
- for (const CXXBaseSpecifier &B : D->bases()) {
- assert(B.getType()->getAsCXXRecordDecl() && "invalid base?");
- if (B.isVirtual())
- SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
- << diag::TraitNotSatisfiedReason::VBase << B.getType()
- << B.getSourceRange();
- if (!SemaRef.IsCXXTriviallyRelocatableType(B.getType()))
- SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
- << diag::TraitNotSatisfiedReason::NTRBase << B.getType()
- << B.getSourceRange();
- }
- for (const FieldDecl *Field : D->fields()) {
- if (!Field->getType()->isReferenceType() &&
- !SemaRef.IsCXXTriviallyRelocatableType(Field->getType()))
- SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
- << diag::TraitNotSatisfiedReason::NTRField << Field
- << Field->getType() << Field->getSourceRange();
- }
- if (D->hasDeletedDestructor())
- SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
- << diag::TraitNotSatisfiedReason::DeletedDtr << /*Deleted*/ 0
- << D->getDestructor()->getSourceRange();
-
- if (D->hasAttr<TriviallyRelocatableAttr>())
- return;
-
+static void DiagnoseNonDefaultMovable(Sema &SemaRef, SourceLocation Loc,
+ const CXXRecordDecl *D) {
if (D->isUnion()) {
auto DiagSPM = [&](CXXSpecialMemberKind K, bool Has) {
if (Has)
@@ -2052,7 +2029,7 @@ static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
}
if (!D->hasSimpleMoveConstructor() && !D->hasSimpleCopyConstructor()) {
- const auto *Decl = cast<CXXConstructorDecl>(
+ const auto *Decl = cast_or_null<CXXConstructorDecl>(
LookupSpecialMemberFromXValue(SemaRef, D, /*Assign=*/false));
if (Decl && Decl->isUserProvided())
SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
@@ -2076,6 +2053,37 @@ static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
SourceLocation Loc,
+ const CXXRecordDecl *D) {
+ for (const CXXBaseSpecifier &B : D->bases()) {
+ assert(B.getType()->getAsCXXRecordDecl() && "invalid base?");
+ if (B.isVirtual())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::VBase << B.getType()
+ << B.getSourceRange();
+ if (!SemaRef.IsCXXTriviallyRelocatableType(B.getType()))
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::NTRBase << B.getType()
+ << B.getSourceRange();
+ }
+ for (const FieldDecl *Field : D->fields()) {
+ if (!Field->getType()->isReferenceType() &&
+ !SemaRef.IsCXXTriviallyRelocatableType(Field->getType()))
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::NTRField << Field
+ << Field->getType() << Field->getSourceRange();
+ }
+ if (D->hasDeletedDestructor())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::DeletedDtr << /*Deleted*/ 0
+ << D->getDestructor()->getSourceRange();
+
+ if (D->hasAttr<TriviallyRelocatableAttr>())
+ return;
+ DiagnoseNonDefaultMovable(SemaRef, Loc, D);
+}
+
+static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
+ SourceLocation Loc,
QualType T) {
SemaRef.Diag(Loc, diag::note_unsatisfied_trait)
<< T << diag::TraitName::TriviallyRelocatable;
@@ -2102,6 +2110,92 @@ static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D;
}
+static void DiagnoseNonReplaceableReason(Sema &SemaRef, SourceLocation Loc,
+ const CXXRecordDecl *D) {
+ for (const CXXBaseSpecifier &B : D->bases()) {
+ assert(B.getType()->getAsCXXRecordDecl() && "invalid base?");
+ if (!SemaRef.IsCXXReplaceableType(B.getType()))
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::NonReplaceableBase << B.getType()
+ << B.getSourceRange();
+ }
+ for (const FieldDecl *Field : D->fields()) {
+ if (!SemaRef.IsCXXReplaceableType(Field->getType()))
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::NonReplaceableField << Field
+ << Field->getType() << Field->getSourceRange();
+ }
+ if (D->hasDeletedDestructor())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::DeletedDtr << /*Deleted*/ 0
+ << D->getDestructor()->getSourceRange();
+
+ if (!D->hasSimpleMoveConstructor() && !D->hasSimpleCopyConstructor()) {
+    const auto *Decl = cast_or_null<CXXConstructorDecl>(
+ LookupSpecialMemberFromXValue(SemaRef, D, /*Assign=*/false));
+ if (Decl && Decl->isDeleted())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::DeletedCtr
+ << Decl->isMoveConstructor() << Decl->getSourceRange();
+ }
+ if (!D->hasSimpleMoveAssignment() && !D->hasSimpleCopyAssignment()) {
+ CXXMethodDecl *Decl =
+ LookupSpecialMemberFromXValue(SemaRef, D, /*Assign=*/true);
+ if (Decl && Decl->isDeleted())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::DeletedAssign
+ << Decl->isMoveAssignmentOperator() << Decl->getSourceRange();
+ }
+
+ if (D->hasAttr<ReplaceableAttr>())
+ return;
+ DiagnoseNonDefaultMovable(SemaRef, Loc, D);
+}
+
+static void DiagnoseNonReplaceableReason(Sema &SemaRef, SourceLocation Loc,
+ QualType T) {
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait)
+ << T << diag::TraitName::Replaceable;
+
+ if (T->isVariablyModifiedType())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::VLA;
+
+ if (T->isReferenceType())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::Ref;
+ T = T.getNonReferenceType();
+
+ if (T.isConstQualified())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::Const;
+
+ if (T.isVolatileQualified())
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::Volatile;
+
+ bool IsArray = T->isArrayType();
+ T = SemaRef.getASTContext().getBaseElementType(T.getUnqualifiedType());
+
+ if (T->isScalarType())
+ return;
+
+ const CXXRecordDecl *D = T->getAsCXXRecordDecl();
+ if (!D) {
+ SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+ << diag::TraitNotSatisfiedReason::NotScalarOrClass << IsArray;
+ return;
+ }
+
+ if (D->isInvalidDecl())
+ return;
+
+ if (D->hasDefinition())
+ DiagnoseNonReplaceableReason(SemaRef, Loc, D);
+
+ SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D;
+}
+
static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
SourceLocation Loc,
const CXXRecordDecl *D) {
@@ -2192,6 +2286,9 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) {
case UTT_IsCppTriviallyRelocatable:
DiagnoseNonTriviallyRelocatableReason(*this, E->getBeginLoc(), Args[0]);
break;
+ case UTT_IsReplaceable:
+ DiagnoseNonReplaceableReason(*this, E->getBeginLoc(), Args[0]);
+ break;
case UTT_IsTriviallyCopyable:
DiagnoseNonTriviallyCopyableReason(*this, E->getBeginLoc(), Args[0]);
break;
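
An illustrative trigger for the new replaceability notes; the __builtin_is_replaceable spelling is assumed to match the existing trivially-relocatable builtin family:

```c++
// Illustrative sketch: S's deleted move assignment makes it non-replaceable,
// so the failing assertion should produce the new notes, along the lines of:
//   'S' is not replaceable
//   because it has a deleted move assignment operator
struct S {
  S &operator=(S &&) = delete;
};

static_assert(__builtin_is_replaceable(S), "see notes");
```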
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index 81fca5d..cfcc47c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -236,6 +236,26 @@ bool isConstOwnerPtrMemberExpr(const clang::Expr *E) {
return isOwnerPtrType(T) && T.isConstQualified();
}
+bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E) {
+ auto *ME = dyn_cast<MemberExpr>(E);
+ if (!ME)
+ return false;
+ auto *Base = ME->getBase();
+ if (!Base)
+ return false;
+ if (!isa<CXXThisExpr>(Base->IgnoreParenCasts()))
+ return false;
+ auto *D = ME->getMemberDecl();
+ if (!D)
+ return false;
+ auto T = D->getType();
+ auto *CXXRD = T->getAsCXXRecordDecl();
+ if (!CXXRD)
+ return false;
+ auto result = isCheckedPtrCapable(CXXRD);
+ return result && *result;
+}
+
class EnsureFunctionVisitor
: public ConstStmtVisitor<EnsureFunctionVisitor, bool> {
public:
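
A sketch of the pattern isExprToGetCheckedPtrCapableMember accepts (WebKit's CanMakeCheckedPtr is assumed to be in scope; the classes are illustrative):

```c++
// Illustrative sketch: m_tree is a member of 'this' whose type supports
// CheckedPtr, so passing it as a call argument is now treated as safe by
// the unchecked-pointer checkers below.
class Tree : public CanMakeCheckedPtr<Tree> {
public:
  void rebalance();
};

class Forest {
public:
  void update() { process(m_tree); } // MemberExpr on 'this' -> safe
  void process(Tree &);

private:
  Tree m_tree;
};
```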
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
index e2cc7b9..8302bbe 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
@@ -69,6 +69,10 @@ bool isASafeCallArg(const clang::Expr *E);
/// \returns true if E is a MemberExpr accessing a const smart pointer type.
bool isConstOwnerPtrMemberExpr(const clang::Expr *E);
+/// \returns true if E is a MemberExpr accessing a member variable which
+/// supports CheckedPtr.
+bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E);
+
/// \returns true if E is a CXXMemberCallExpr which returns a const smart
/// pointer type.
class EnsureFunctionAnalysis {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 4ddd1149..72199af 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -236,6 +236,7 @@ std::optional<bool> isUnchecked(const QualType T) {
void RetainTypeChecker::visitTranslationUnitDecl(
const TranslationUnitDecl *TUD) {
IsARCEnabled = TUD->getLangOpts().ObjCAutoRefCount;
+ DefaultSynthProperties = TUD->getLangOpts().ObjCDefaultSynthProperties;
}
void RetainTypeChecker::visitTypedef(const TypedefDecl *TD) {
@@ -468,6 +469,18 @@ bool isPtrConversion(const FunctionDecl *F) {
FunctionName == "checked_objc_cast")
return true;
+ auto ReturnType = F->getReturnType();
+ if (auto *Type = ReturnType.getTypePtrOrNull()) {
+ if (auto *AttrType = dyn_cast<AttributedType>(Type)) {
+ if (auto *Attr = AttrType->getAttr()) {
+ if (auto *AnnotateType = dyn_cast<AnnotateTypeAttr>(Attr)) {
+ if (AnnotateType->getAnnotation() == "webkit.pointerconversion")
+ return true;
+ }
+ }
+ }
+ }
+
return false;
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index f9fcfe9..3c9560c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -76,12 +76,14 @@ class RetainTypeChecker {
llvm::DenseSet<const RecordType *> CFPointees;
llvm::DenseSet<const Type *> RecordlessTypes;
bool IsARCEnabled{false};
+ bool DefaultSynthProperties{true};
public:
void visitTranslationUnitDecl(const TranslationUnitDecl *);
void visitTypedef(const TypedefDecl *);
bool isUnretained(const QualType, bool ignoreARC = false);
bool isARCEnabled() const { return IsARCEnabled; }
+ bool defaultSynthProperties() const { return DefaultSynthProperties; }
};
/// \returns true if \p Class is NS or CF objects AND not retained, false if
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
index 78675e5..6bc39ab 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
@@ -443,6 +443,10 @@ public:
return isRefOrCheckedPtrType(type);
}
+ bool isSafeExpr(const Expr *E) const final {
+ return isExprToGetCheckedPtrCapableMember(E);
+ }
+
const char *ptrKind() const final { return "unchecked"; }
};
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
index 4fb4770..7cd86a6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
@@ -417,6 +417,9 @@ public:
bool isSafePtrType(const QualType type) const final {
return isRefOrCheckedPtrType(type);
}
+ bool isSafeExpr(const Expr *E) const final {
+ return isExprToGetCheckedPtrCapableMember(E);
+ }
const char *ptrKind() const final { return "unchecked"; }
};
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
index b1350b9..8faf6a2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
@@ -28,6 +28,7 @@ class RawPtrRefMemberChecker
private:
BugType Bug;
mutable BugReporter *BR;
+ mutable llvm::DenseSet<const ObjCIvarDecl *> IvarDeclsToIgnore;
protected:
mutable std::optional<RetainTypeChecker> RTC;
@@ -36,7 +37,8 @@ public:
RawPtrRefMemberChecker(const char *description)
: Bug(this, description, "WebKit coding guidelines") {}
- virtual std::optional<bool> isUnsafePtr(QualType) const = 0;
+ virtual std::optional<bool> isUnsafePtr(QualType,
+ bool ignoreARC = false) const = 0;
virtual const char *typeName() const = 0;
virtual const char *invariant() const = 0;
@@ -138,6 +140,8 @@ public:
return;
}
if (auto *ID = dyn_cast<ObjCImplementationDecl>(CD)) {
+ for (auto *PropImpl : ID->property_impls())
+ visitPropImpl(CD, PropImpl);
for (auto *Ivar : ID->ivars())
visitIvarDecl(CD, Ivar);
return;
@@ -148,6 +152,10 @@ public:
const ObjCIvarDecl *Ivar) const {
if (BR->getSourceManager().isInSystemHeader(Ivar->getLocation()))
return;
+
+ if (IvarDeclsToIgnore.contains(Ivar))
+ return;
+
auto QT = Ivar->getType();
const Type *IvarType = QT.getTypePtrOrNull();
if (!IvarType)
@@ -157,6 +165,8 @@ public:
if (!IsUnsafePtr || !*IsUnsafePtr)
return;
+ IvarDeclsToIgnore.insert(Ivar);
+
if (auto *MemberCXXRD = IvarType->getPointeeCXXRecordDecl())
reportBug(Ivar, IvarType, MemberCXXRD, CD);
else if (auto *ObjCDecl = getObjCDecl(IvarType))
@@ -167,13 +177,15 @@ public:
const ObjCPropertyDecl *PD) const {
if (BR->getSourceManager().isInSystemHeader(PD->getLocation()))
return;
- auto QT = PD->getType();
- const Type *PropType = QT.getTypePtrOrNull();
- if (!PropType)
- return;
- auto IsUnsafePtr = isUnsafePtr(QT);
- if (!IsUnsafePtr || !*IsUnsafePtr)
+ if (const ObjCInterfaceDecl *ID = dyn_cast<ObjCInterfaceDecl>(CD)) {
+ if (!RTC || !RTC->defaultSynthProperties() ||
+ ID->isObjCRequiresPropertyDefs())
+ return;
+ }
+
+ auto [IsUnsafe, PropType] = isPropImplUnsafePtr(PD);
+ if (!IsUnsafe)
return;
if (auto *MemberCXXRD = PropType->getPointeeCXXRecordDecl())
@@ -182,6 +194,47 @@ public:
reportBug(PD, PropType, ObjCDecl, CD);
}
+ void visitPropImpl(const ObjCContainerDecl *CD,
+ const ObjCPropertyImplDecl *PID) const {
+ if (BR->getSourceManager().isInSystemHeader(PID->getLocation()))
+ return;
+
+ if (PID->getPropertyImplementation() != ObjCPropertyImplDecl::Synthesize)
+ return;
+
+ auto *PropDecl = PID->getPropertyDecl();
+ if (auto *IvarDecl = PID->getPropertyIvarDecl()) {
+ if (IvarDeclsToIgnore.contains(IvarDecl))
+ return;
+ IvarDeclsToIgnore.insert(IvarDecl);
+ }
+ auto [IsUnsafe, PropType] = isPropImplUnsafePtr(PropDecl);
+ if (!IsUnsafe)
+ return;
+
+ if (auto *MemberCXXRD = PropType->getPointeeCXXRecordDecl())
+ reportBug(PropDecl, PropType, MemberCXXRD, CD);
+ else if (auto *ObjCDecl = getObjCDecl(PropType))
+ reportBug(PropDecl, PropType, ObjCDecl, CD);
+ }
+
+ std::pair<bool, const Type *>
+ isPropImplUnsafePtr(const ObjCPropertyDecl *PD) const {
+ if (!PD)
+ return {false, nullptr};
+
+ auto QT = PD->getType();
+ const Type *PropType = QT.getTypePtrOrNull();
+ if (!PropType)
+ return {false, nullptr};
+
+    // An "assign" property doesn't retain even under ARC, so treat it as unsafe.
+ bool ignoreARC =
+ !PD->isReadOnly() && PD->getSetterKind() == ObjCPropertyDecl::Assign;
+ auto IsUnsafePtr = isUnsafePtr(QT, ignoreARC);
+ return {IsUnsafePtr && *IsUnsafePtr, PropType};
+ }
+
bool shouldSkipDecl(const RecordDecl *RD) const {
if (!RD->isThisDeclarationADefinition())
return true;
@@ -272,7 +325,7 @@ public:
: RawPtrRefMemberChecker("Member variable is a raw-pointer/reference to "
"reference-countable type") {}
- std::optional<bool> isUnsafePtr(QualType QT) const final {
+ std::optional<bool> isUnsafePtr(QualType QT, bool) const final {
return isUncountedPtr(QT.getCanonicalType());
}
@@ -289,7 +342,7 @@ public:
: RawPtrRefMemberChecker("Member variable is a raw-pointer/reference to "
"checked-pointer capable type") {}
- std::optional<bool> isUnsafePtr(QualType QT) const final {
+ std::optional<bool> isUnsafePtr(QualType QT, bool) const final {
return isUncheckedPtr(QT.getCanonicalType());
}
@@ -309,8 +362,8 @@ public:
RTC = RetainTypeChecker();
}
- std::optional<bool> isUnsafePtr(QualType QT) const final {
- return RTC->isUnretained(QT);
+ std::optional<bool> isUnsafePtr(QualType QT, bool ignoreARC) const final {
+ return RTC->isUnretained(QT, ignoreARC);
}
const char *typeName() const final { return "retainable type"; }
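
The new ignoreARC parameter exists for one case: a writable "assign" property never retains its object, so ARC cannot make it safe. A condensed restatement of that rule as a standalone helper (assumed to match the Clang AST API; for illustration only):

    #include "clang/AST/DeclObjC.h"

    // True when the property bypasses ARC's retain semantics and must be
    // treated as an unsafe raw pointer even with ARC enabled.
    static bool bypassesARC(const clang::ObjCPropertyDecl *PD) {
      return !PD->isReadOnly() &&
             PD->getSetterKind() == clang::ObjCPropertyDecl::Assign;
    }
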
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp
index c938ba5..b2fb042 100644
--- a/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp
@@ -46,6 +46,26 @@ static void baz() {
} // namespace call_args_checked
+namespace call_args_member {
+
+void consume(CheckedObj&);
+
+struct WrapperObj {
+ CheckedObj checked;
+ CheckedObj& checkedRef;
+ void foo() {
+ consume(checked);
+ consume(checkedRef);
+ // expected-warning@-1{{Call argument is unchecked and unsafe [alpha.webkit.UncheckedCallArgsChecker]}}
+ }
+ void bar(WrapperObj& other) {
+ consume(other.checked);
+ // expected-warning@-1{{Call argument is unchecked and unsafe [alpha.webkit.UncheckedCallArgsChecker]}}
+ }
+};
+
+} // namespace call_args_member
+
namespace call_args_default {
void someFunction(RefCountableAndCheckable* = makeObj());
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-safe-functions.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-safe-functions.cpp
index a874465..5c540a5 100644
--- a/clang/test/Analysis/Checkers/WebKit/call-args-safe-functions.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-safe-functions.cpp
@@ -1,10 +1,12 @@
// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
-// expected-no-diagnostics
+
+#include "mock-types.h"
class Base {
public:
- inline void ref();
- inline void deref();
+ void ref();
+ void deref();
+ void doWork();
};
class Derived : public Base {
@@ -21,6 +23,7 @@ class SubDerived final : public Derived {
class OtherObject {
public:
Derived* obj();
+ Base* base();
};
class String {
@@ -44,6 +47,12 @@ inline Target* uncheckedDowncast(Source* source)
return static_cast<Target*>(source);
}
+template<typename Target, typename Source>
+Target* [[clang::annotate_type("webkit.pointerconversion")]] newCastFunction(Source*);
+
+template<typename Target, typename Source>
+Target* [[clang::annotate_type("unrelated-annotation")]] badCastFunction(Source*);
+
template<typename... Types>
String toString(const Types&... values);
@@ -52,5 +61,17 @@ void foo(OtherObject* other)
dynamicDowncast<SubDerived>(other->obj());
checkedDowncast<SubDerived>(other->obj());
uncheckedDowncast<SubDerived>(other->obj());
+ newCastFunction<SubDerived>(other->obj());
+ badCastFunction<SubDerived>(other->obj());
+ // expected-warning@-1{{Call argument is uncounted and unsafe}}
toString(other->obj());
}
+
+struct SomeStruct {
+ Derived* [[clang::annotate_type("webkit.pointerconversion")]] ptrConversion(Base*);
+
+ void foo(OtherObject& otherObj) {
+ RefPtr ptr = otherObj.base();
+ ptrConversion(ptr.get())->doWork();
+ }
+};
diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
index 93e7dfd..9e4356a 100644
--- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
@@ -22,6 +22,7 @@ typedef struct CF_BRIDGED_TYPE(id) CGImage *CGImageRef;
#define NS_RETURNS_RETAINED __attribute__((ns_returns_retained))
#define CF_CONSUMED __attribute__((cf_consumed))
#define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
+#define NS_REQUIRES_PROPERTY_DEFINITIONS __attribute__((objc_requires_property_definitions))
extern const CFAllocatorRef kCFAllocatorDefault;
typedef struct _NSZone NSZone;
diff --git a/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp
index 3bc7523..2984f8b 100644
--- a/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp
@@ -290,6 +290,28 @@ void foo() {
} // namespace local_assignment_to_global
+namespace member_var {
+
+ struct WrapperObj {
+ CheckedObj checked;
+ CheckedObj& checkedRef;
+ void foo() {
+ auto *a = &checked;
+ a->method();
+ auto *b = &checkedRef;
+ // expected-warning@-1{{Local variable 'b' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}}
+ b->method();
+ }
+
+ void bar(WrapperObj& wrapper) {
+ CheckedObj* ptr = &wrapper.checked;
+ // expected-warning@-1{{Local variable 'ptr' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}}
+ ptr->method();
+ }
+ };
+
+} // namespace member_var
+
namespace local_refcountable_checkable_object {
RefCountableAndCheckable* provide_obj();
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-arc.mm
index 3491bc9..00e6e6e 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-members-arc.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-arc.mm
@@ -64,3 +64,39 @@ namespace ptr_to_ptr_to_retained {
};
} // namespace ptr_to_ptr_to_retained
+
+@interface AnotherObject : NSObject {
+ NSString *ns_string;
+ CFStringRef cf_string;
+ // expected-warning@-1{{Instance variable 'cf_string' in 'AnotherObject' is a retainable type 'CFStringRef'; member variables must be a RetainPtr}}
+}
+@property(nonatomic, strong) NSString *prop_string1;
+@property(nonatomic, assign) NSString *prop_string2;
+// expected-warning@-1{{Property 'prop_string2' in 'AnotherObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+@property(nonatomic, unsafe_unretained) NSString *prop_string3;
+// expected-warning@-1{{Property 'prop_string3' in 'AnotherObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+@property(nonatomic, readonly) NSString *prop_string4;
+@end
+
+NS_REQUIRES_PROPERTY_DEFINITIONS
+@interface NoSynthObject : NSObject {
+ NSString *ns_string;
+ CFStringRef cf_string;
+ // expected-warning@-1{{Instance variable 'cf_string' in 'NoSynthObject' is a retainable type 'CFStringRef'; member variables must be a RetainPtr}}
+}
+@property(nonatomic, readonly, strong) NSString *prop_string1;
+@property(nonatomic, readonly, strong) NSString *prop_string2;
+@property(nonatomic, assign) NSString *prop_string3;
+// expected-warning@-1{{Property 'prop_string3' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+@property(nonatomic, unsafe_unretained) NSString *prop_string4;
+// expected-warning@-1{{Property 'prop_string4' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+@end
+
+@implementation NoSynthObject
+- (NSString *)prop_string1 {
+ return nil;
+}
+@synthesize prop_string2;
+@synthesize prop_string3;
+@synthesize prop_string4;
+@end
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members.mm
index 0cb4c4a..46f65df 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-members.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members.mm
@@ -99,3 +99,28 @@ namespace ptr_to_ptr_to_retained {
@property(nonatomic, strong) NSString *prop_string;
// expected-warning@-1{{Property 'prop_string' in 'AnotherObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
@end
+
+NS_REQUIRES_PROPERTY_DEFINITIONS
+@interface NoSynthObject : NSObject {
+ NSString *ns_string;
+ // expected-warning@-1{{Instance variable 'ns_string' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+ CFStringRef cf_string;
+ // expected-warning@-1{{Instance variable 'cf_string' in 'NoSynthObject' is a retainable type 'CFStringRef'; member variables must be a RetainPtr}}
+}
+@property(nonatomic, readonly, strong) NSString *prop_string1;
+@property(nonatomic, readonly, strong) NSString *prop_string2;
+// expected-warning@-1{{Property 'prop_string2' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'}}
+@property(nonatomic, assign) NSString *prop_string3;
+// expected-warning@-1{{Property 'prop_string3' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+@property(nonatomic, unsafe_unretained) NSString *prop_string4;
+// expected-warning@-1{{Property 'prop_string4' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
+@end
+
+@implementation NoSynthObject
+- (NSString *)prop_string1 {
+ return nil;
+}
+@synthesize prop_string2;
+@synthesize prop_string3;
+@synthesize prop_string4;
+@end
diff --git a/clang/test/C/C23/n3006.c b/clang/test/C/C23/n3006.c
new file mode 100644
index 0000000..e0713fa
--- /dev/null
+++ b/clang/test/C/C23/n3006.c
@@ -0,0 +1,113 @@
+// RUN: %clang_cc1 -std=c23 -verify %s
+
+/* WG14 N3006: Yes
+ * Underspecified object declarations
+ */
+
+void struct_test(void) {
+ struct S1 { int x, y; }; // expected-note {{field 'x' has type 'int' here}}
+
+ auto normal_struct = (struct S1){ 1, 2 };
+ auto normal_struct2 = (struct S1) { .x = 1, .y = 2 };
+ auto underspecified_struct = (struct S2 { int x, y; }){ 1, 2 };
+ auto underspecified_struct_redef = (struct S1 { char x, y; }){ 'A', 'B'}; // expected-error {{type 'struct S1' has incompatible definitions}} \
+ expected-error {{cannot use 'auto' with array in C}} \
+ expected-note {{field 'x' has type 'char' here}}
+ auto underspecified_empty_struct = (struct S3 { }){ };
+ auto zero_init_struct = (struct S4 { int x; }){ 0 };
+ int field_struct = (struct S5 { int y; }){ 0 }.y;
+}
+
+void union_test(void) {
+ union U1 { int a; double b; }; // expected-note {{field 'a' has type 'int' here}}
+
+ auto normal_union_int = (union U1){ .a = 12 };
+ auto normal_union_double = (union U1){ .b = 2.4 };
+ auto underspecified_union = (union U2 { int a; double b; }){ .a = 34 };
+ auto underspecified_union_redef = (union U1 { char a; double b; }){ .a = 'A' }; // expected-error {{type 'union U1' has incompatible definitions}} \
+ expected-error {{cannot use 'auto' with array in C}} \
+ expected-note {{field 'a' has type 'char' here}}
+ auto underspecified_empty_union = (union U3 { }){ };
+}
+
+void enum_test(void) {
+ enum E1 { FOO, BAR }; // expected-note {{enumerator 'BAR' with value 1 here}}
+
+ auto normal_enum_foo = (enum E1){ FOO };
+ auto normal_enum_bar = (enum E1){ BAR };
+ auto underspecified_enum = (enum E2 { BAZ, QUX }){ BAZ };
+ auto underspecified_enum_redef = (enum E1 { ONE, TWO }){ ONE }; // expected-error {{type 'enum E1' has incompatible definitions}} \
+ expected-error {{cannot use 'auto' with array in C}} \
+ expected-note {{enumerator 'ONE' with value 0 here}}
+ auto underspecified_empty_enum = (enum E3 { }){ }; // expected-error {{use of empty enum}}
+ auto underspecified_undeclared_enum = (enum E4){ FOO }; // expected-error {{variable has incomplete type 'enum E4'}} \
+ expected-note {{forward declaration of 'enum E4'}}
+}
+
+void constexpr_test(void) {
+ constexpr auto ce_struct = (struct S1){ 1, 2 }; // expected-error {{variable has incomplete type 'struct S1'}} \
+ expected-note {{forward declaration of 'struct S1'}}
+ constexpr auto ce_struct_zero_init = (struct S2 { int x; }){ 0 };
+ constexpr int ce_struct_field = (struct S3 { int y; }){ 0 }.y;
+ constexpr auto ce_union = (union U1){ .a = 12 }; // expected-error {{variable has incomplete type 'union U1'}} \
+ expected-note {{forward declaration of 'union U1'}}
+
+ constexpr auto ce_enum = (enum E1 { BAZ, QUX }){ BAZ };
+ constexpr auto ce_empty_enum = (enum E2){ FOO }; // expected-error {{use of undeclared identifier 'FOO'}}
+}
+
+void self_reference_test(void) {
+ constexpr int i = i; // expected-error {{constexpr variable 'i' must be initialized by a constant expression}} \
+ expected-note {{read of object outside its lifetime is not allowed in a constant expression}}
+ auto j = j; // expected-error {{variable 'j' declared with deduced type 'auto' cannot appear in its own initializer}}
+}
+
+void redefinition_test(void) {
+ const struct S { int x; } s; // expected-warning {{default initialization of an object of type 'const struct S' leaves the object uninitialized}} \
+ expected-note {{previous definition is here}}
+ constexpr struct S s = {0}; // expected-error {{redefinition of 's'}}
+}
+
+void declaring_an_underspecified_defined_object_test(void) {
+ struct S { int x, y; };
+ constexpr int i = (struct T { int a, b; }){0, 1}.a;
+
+ struct T t = { 1, 2 };
+}
+
+void constexpr_compliance_test(void) {
+ int x = (struct Foo { int x; }){ 0 }.x;
+ constexpr int y = (struct Bar { int x; }){ 0 }.x;
+}
+
+void builtin_functions_test(void) {
+ constexpr typeof(struct s *) x = 0;
+ auto so = sizeof(struct S {});
+ auto to = (typeof(struct S {})){};
+}
+
+void misc_test(void) {
+ constexpr struct S { int a, b; } y = { 0 };
+ constexpr int a = 0, b = 0;
+ auto c = (struct T { int x, y; }){0, 0};
+ auto s2 = ({struct T { int x; } s = {}; s.x; });
+ auto s3 = ((struct {}){},0); // expected-warning {{left operand of comma operator has no effect}}
+ constexpr int (*fp)(struct X { int x; } val) = 0;
+ auto v = (void (*)(int y))0;
+}
+
+void misc_struct_test(void) {
+ constexpr struct {
+ int a;
+ } a = {};
+
+ constexpr struct {
+ int b;
+ } b = (struct S { int x; }){ 0 }; // expected-error-re {{initializing 'const struct (unnamed struct at {{.*}}n3006.c:104:13)' with an expression of incompatible type 'struct S'}}
+
+ auto z = ({
+ int a = 12;
+ struct {} s;
+ a;
+ });
+}
diff --git a/clang/test/CIR/CodeGen/class.cpp b/clang/test/CIR/CodeGen/class.cpp
index 7c41bdb..d7f3772 100644
--- a/clang/test/CIR/CodeGen/class.cpp
+++ b/clang/test/CIR/CodeGen/class.cpp
@@ -51,3 +51,52 @@ public:
int use(Derived *d) { return d->b; }
+// CIR: cir.func @_Z3useP7Derived(%[[ARG0:.*]]: !cir.ptr<!rec_Derived>
+// CIR: %[[D_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Derived>, !cir.ptr<!cir.ptr<!rec_Derived>>, ["d", init]
+// CIR: cir.store %[[ARG0]], %[[D_ADDR]]
+// CIR: %[[D_PTR:.*]] = cir.load align(8) %[[D_ADDR]]
+// CIR: %[[D_B_ADDR:.*]] = cir.get_member %[[D_PTR]][1] {name = "b"}
+// CIR: %[[D_B:.*]] = cir.load align(4) %[[D_B_ADDR]]
+
+// LLVM: define{{.*}} i32 @_Z3useP7Derived
+// LLVM: getelementptr %class.Derived, ptr %{{.*}}, i32 0, i32 1
+
+// OGCG: define{{.*}} i32 @_Z3useP7Derived
+// OGCG: getelementptr inbounds nuw %class.Derived, ptr %{{.*}}, i32 0, i32 1
+
+int use_base() {
+ Derived d;
+ return d.a;
+}
+
+// CIR: cir.func @_Z8use_basev
+// CIR: %[[D_ADDR:.*]] = cir.alloca !rec_Derived, !cir.ptr<!rec_Derived>, ["d"]
+// CIR: %[[BASE_ADDR:.*]] = cir.base_class_addr %[[D_ADDR]] : !cir.ptr<!rec_Derived> nonnull [0] -> !cir.ptr<!rec_Base>
+// CIR: %[[D_A_ADDR:.*]] = cir.get_member %[[BASE_ADDR]][0] {name = "a"} : !cir.ptr<!rec_Base> -> !cir.ptr<!s32i>
+// CIR: %[[D_A:.*]] = cir.load align(4) %[[D_A_ADDR]] : !cir.ptr<!s32i>, !s32i
+
+// LLVM: define{{.*}} i32 @_Z8use_basev
+// LLVM: %[[D:.*]] = alloca %class.Derived
+// LLVM: %[[D_A_ADDR:.*]] = getelementptr %class.Base, ptr %[[D]], i32 0, i32 0
+
+// OGCG: define{{.*}} i32 @_Z8use_basev
+// OGCG: %[[D:.*]] = alloca %class.Derived
+// OGCG: %[[D_A_ADDR:.*]] = getelementptr inbounds nuw %class.Base, ptr %[[D]], i32 0, i32 0
+
+int use_base_via_pointer(Derived *d) {
+ return d->a;
+}
+
+// CIR: cir.func @_Z20use_base_via_pointerP7Derived(%[[ARG0:.*]]: !cir.ptr<!rec_Derived>
+// CIR: %[[D_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Derived>, !cir.ptr<!cir.ptr<!rec_Derived>>, ["d", init]
+// CIR: cir.store %[[ARG0]], %[[D_ADDR]]
+// CIR: %[[D:.*]] = cir.load align(8) %[[D_ADDR]]
+// CIR: %[[BASE_ADDR:.*]] = cir.base_class_addr %[[D]] : !cir.ptr<!rec_Derived> nonnull [0] -> !cir.ptr<!rec_Base>
+// CIR: %[[D_A_ADDR:.*]] = cir.get_member %[[BASE_ADDR]][0] {name = "a"}
+// CIR: %[[D_A:.*]] = cir.load align(4) %[[D_A_ADDR]]
+
+// LLVM: define{{.*}} i32 @_Z20use_base_via_pointerP7Derived
+// LLVM: %[[D_A_ADDR:.*]] = getelementptr %class.Base, ptr %{{.*}}, i32 0, i32 0
+
+// OGCG: define{{.*}} i32 @_Z20use_base_via_pointerP7Derived
+// OGCG: %[[D_A_ADDR:.*]] = getelementptr inbounds nuw %class.Base, ptr %{{.*}}, i32 0, i32 0
diff --git a/clang/test/CIR/CodeGen/enum.cpp b/clang/test/CIR/CodeGen/enum.cpp
new file mode 100644
index 0000000..5d9b105
--- /dev/null
+++ b/clang/test/CIR/CodeGen/enum.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck %s --input-file=%t.cir
+
+enum Numbers {
+ Zero,
+ One,
+ Two,
+ Three
+};
+
+int f() {
+ return Numbers::One;
+}
+
+// CHECK: cir.func{{.*}} @_Z1fv
+// CHECK: cir.const #cir.int<1> : !u32i
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index aa836c2..e1814f2 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -1116,3 +1116,28 @@ void foo19() {
// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
// OGCG: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> <i32 7, i32 5, i32 3, i32 1>
+
+void foo20() {
+ vi4 a;
+ vi4 b;
+ vi4 u = __builtin_shufflevector(a, b, -1, 1, -1, 1);
+}
+
+// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b"]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[TMP_A]], %[[TMP_B]] : !cir.vector<4 x !s32i>) [#cir.int<-1> :
+// CIR-SAME: !s64i, #cir.int<1> : !s64i, #cir.int<-1> : !s64i, #cir.int<1> : !s64i] : !cir.vector<4 x !s32i>
+
+// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// LLVM: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> <i32 poison, i32 1, i32 poison, i32 1>
+
+// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// OGCG: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> <i32 poison, i32 1, i32 poison, i32 1>
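
A shuffle index of -1 means the lane's value is a "don't care": both the CIR and LLVM pipelines materialize it as a poison lane, as the checks above verify (vector.cpp below repeats the same test for the other vector spelling). A self-contained reproduction of the pattern, assuming the GCC vector_size spelling of vi4:

    typedef int vi4 __attribute__((vector_size(16)));

    vi4 pick_lane1(vi4 a, vi4 b) {
      // Lanes 0 and 2 are unspecified (-1 lowers to poison); lanes 1 and 3
      // both read a[1].
      return __builtin_shufflevector(a, b, -1, 1, -1, 1);
    }
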
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index f53264c..4f116fa 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -1150,3 +1150,28 @@ void foo22() {
// OGCG: %[[VEC_COND:.*]] = icmp ne <4 x i32> {{.*}}, zeroinitializer
// OGCG: %[[RES:.*]] = select <4 x i1> %[[VEC_COND]], <4 x float> {{.*}}, <4 x float> {{.*}}
+
+void foo23() {
+ vi4 a;
+ vi4 b;
+ vi4 u = __builtin_shufflevector(a, b, -1, 1, -1, 1);
+}
+
+// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b"]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[TMP_A]], %[[TMP_B]] : !cir.vector<4 x !s32i>) [#cir.int<-1> :
+// CIR-SAME: !s64i, #cir.int<1> : !s64i, #cir.int<-1> : !s64i, #cir.int<1> : !s64i] : !cir.vector<4 x !s32i>
+
+// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// LLVM: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> <i32 poison, i32 1, i32 poison, i32 1>
+
+// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// OGCG: %[[SHUF:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[TMP_B]], <4 x i32> <i32 poison, i32 1, i32 poison, i32 1>
diff --git a/clang/test/CXX/drs/cwg24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp
index 9c9a3f1..c499a2d 100644
--- a/clang/test/CXX/drs/cwg24xx.cpp
+++ b/clang/test/CXX/drs/cwg24xx.cpp
@@ -215,3 +215,34 @@ void (*q)() throw() = S();
// since-cxx17-error@-1 {{no viable conversion from 'S' to 'void (*)() throw()'}}
// since-cxx17-note@#cwg2486-conv {{candidate function}}
} // namespace cwg2486
+
+
+namespace cwg2496 { // cwg2496: 21
+#if __cplusplus >= 201102L
+struct S {
+ virtual void f(); // #cwg2496-f
+ virtual void g() &; // #cwg2496-g
+ virtual void h(); // #cwg2496-h
+ virtual void i();
+ virtual void j() &;
+ virtual void k() &&;
+ virtual void l() &;
+};
+
+struct T : S {
+ virtual void f() &;
+ // expected-error@-1 {{cannot overload a member function with ref-qualifier '&' with a member function without a ref-qualifier}}
+ // expected-note@#cwg2496-f {{previous declaration is here}}
+ virtual void g();
+ // expected-error@-1 {{cannot overload a member function without a ref-qualifier with a member function with ref-qualifier '&'}}
+ // expected-note@#cwg2496-g {{previous declaration is here}}
+ virtual void h() &&;
+ // expected-error@-1 {{cannot overload a member function with ref-qualifier '&&' with a member function without a ref-qualifier}}
+ // expected-note@#cwg2496-h {{previous declaration is here}}
+ virtual void i();
+ virtual void j() &;
+ virtual void k() &;
+ virtual void l() &&;
+};
+#endif
+}
diff --git a/clang/test/CodeGen/LoongArch/bfloat-abi.c b/clang/test/CodeGen/LoongArch/bfloat-abi.c
new file mode 100644
index 0000000..a8a2529
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/bfloat-abi.c
@@ -0,0 +1,532 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple loongarch64 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LA64
+// RUN: %clang_cc1 -triple loongarch32 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LA32
+
+struct bfloat1 {
+ __bf16 a;
+};
+
+// CHECK-LABEL: define dso_local bfloat @h1
+// CHECK-SAME: (bfloat noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT1:%.*]], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT1]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[TMP1]], align 2
+// CHECK-NEXT: ret bfloat [[TMP2]]
+//
+struct bfloat1 h1(__bf16 a) {
+ struct bfloat1 x;
+ x.a = a;
+ return x;
+}
+
+struct bfloat2 {
+ __bf16 a;
+ __bf16 b;
+};
+
+// CHECK-LABEL: define dso_local { bfloat, bfloat } @h2
+// CHECK-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2:%.*]], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { bfloat, bfloat } poison, bfloat [[TMP3]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { bfloat, bfloat } [[TMP6]], bfloat [[TMP5]], 1
+// CHECK-NEXT: ret { bfloat, bfloat } [[TMP7]]
+//
+struct bfloat2 h2(__bf16 a, __bf16 b) {
+ struct bfloat2 x;
+ x.a = a;
+ x.b = b;
+ return x;
+}
+
+struct bfloat3 {
+ __bf16 a;
+ __bf16 b;
+ __bf16 c;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @h3
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT: entry:
+// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2
+// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca i64, align 8
+// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 6, i1 false)
+// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL_COERCE]], align 8
+// CHECK-LA64-NEXT: ret i64 [[TMP3]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @h3
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT: entry:
+// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2
+// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i32], align 4
+// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i32 6, i1 false)
+// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL_COERCE]], align 4
+// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]]
+//
+struct bfloat3 h3(__bf16 a, __bf16 b, __bf16 c) {
+ struct bfloat3 x;
+ x.a = a;
+ x.b = b;
+ x.c = c;
+ return x;
+}
+
+struct bfloat4 {
+ __bf16 a;
+ __bf16 b;
+ __bf16 c;
+ __bf16 d;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @h4
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT: entry:
+// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2
+// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3
+// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2
+// CHECK-LA64-NEXT: [[TMP4:%.*]] = load i64, ptr [[RETVAL]], align 2
+// CHECK-LA64-NEXT: ret i64 [[TMP4]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @h4
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT: entry:
+// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2
+// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3
+// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2
+// CHECK-LA32-NEXT: [[TMP4:%.*]] = load [2 x i32], ptr [[RETVAL]], align 2
+// CHECK-LA32-NEXT: ret [2 x i32] [[TMP4]]
+//
+struct bfloat4 h4(__bf16 a, __bf16 b, __bf16 c, __bf16 d) {
+ struct bfloat4 x;
+ x.a = a;
+ x.b = b;
+ x.c = c;
+ x.d = d;
+ return x;
+}
+
+struct floatbfloat {
+ float a;
+ __bf16 b;
+};
+
+// CHECK-LABEL: define dso_local { float, bfloat } @fh
+// CHECK-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT:%.*]], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: store float [[TMP0]], ptr [[A1]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { float, bfloat }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { float, bfloat }, ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { float, bfloat } poison, float [[TMP3]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { float, bfloat } [[TMP6]], bfloat [[TMP5]], 1
+// CHECK-NEXT: ret { float, bfloat } [[TMP7]]
+//
+struct floatbfloat fh(float a, __bf16 b) {
+ struct floatbfloat x;
+ x.a = a;
+ x.b = b;
+ return x;
+}
+
+struct floatbfloat2 {
+ float a;
+ __bf16 b;
+ __bf16 c;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @fh2
+// CHECK-LA64-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT: entry:
+// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT2:%.*]], align 4
+// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT: store float [[TMP0]], ptr [[A1]], align 4
+// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4
+// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL]], align 4
+// CHECK-LA64-NEXT: ret i64 [[TMP3]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @fh2
+// CHECK-LA32-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT: entry:
+// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT2:%.*]], align 4
+// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT: store float [[TMP0]], ptr [[A1]], align 4
+// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4
+// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL]], align 4
+// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]]
+//
+struct floatbfloat2 fh2(float a, __bf16 b, __bf16 c) {
+ struct floatbfloat2 x;
+ x.a = a;
+ x.b = b;
+ x.c = c;
+ return x;
+}
+
+struct bfloatfloat {
+ __bf16 a;
+ float b;
+};
+
+// CHECK-LABEL: define dso_local { bfloat, float } @hf
+// CHECK-SAME: (bfloat noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOATFLOAT:%.*]], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT: store float [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOATFLOAT]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR]], align 4
+// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOATFLOAT]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT: store float [[TMP1]], ptr [[B2]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, float }, ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, float }, ptr [[RETVAL]], i32 0, i32 1
+// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { bfloat, float } poison, bfloat [[TMP3]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { bfloat, float } [[TMP6]], float [[TMP5]], 1
+// CHECK-NEXT: ret { bfloat, float } [[TMP7]]
+//
+struct bfloatfloat hf(__bf16 a, float b) {
+ struct bfloatfloat x;
+ x.a = a;
+ x.b = b;
+ return x;
+}
+
+struct bfloat2float {
+ __bf16 a;
+ __bf16 b;
+ float c;
+};
+
+// CHECK-LA64-LABEL: define dso_local i64 @h2f
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT: entry:
+// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2FLOAT:%.*]], align 4
+// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca float, align 4
+// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: store float [[C]], ptr [[C_ADDR]], align 4
+// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4
+// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT: [[TMP2:%.*]] = load float, ptr [[C_ADDR]], align 4
+// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT: store float [[TMP2]], ptr [[C3]], align 4
+// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL]], align 4
+// CHECK-LA64-NEXT: ret i64 [[TMP3]]
+//
+// CHECK-LA32-LABEL: define dso_local [2 x i32] @h2f
+// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT: entry:
+// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2FLOAT:%.*]], align 4
+// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca float, align 4
+// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: store float [[C]], ptr [[C_ADDR]], align 4
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4
+// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA32-NEXT: [[TMP2:%.*]] = load float, ptr [[C_ADDR]], align 4
+// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA32-NEXT: store float [[TMP2]], ptr [[C3]], align 4
+// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL]], align 4
+// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]]
+//
+struct bfloat2float h2f(__bf16 a, __bf16 b, float c) {
+ struct bfloat2float x;
+ x.a = a;
+ x.b = b;
+ x.c = c;
+ return x;
+}
+
+struct floatbfloat3 {
+ float a;
+ __bf16 b;
+ __bf16 c;
+ __bf16 d;
+};
+
+// CHECK-LA64-LABEL: define dso_local [2 x i64] @fh3
+// CHECK-LA64-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT: entry:
+// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT3:%.*]], align 4
+// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i64], align 8
+// CHECK-LA64-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT: store float [[TMP0]], ptr [[A1]], align 4
+// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4
+// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 3
+// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 4
+// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 4 [[RETVAL]], i64 12, i1 false)
+// CHECK-LA64-NEXT: [[TMP4:%.*]] = load [2 x i64], ptr [[RETVAL_COERCE]], align 8
+// CHECK-LA64-NEXT: ret [2 x i64] [[TMP4]]
+//
+// CHECK-LA32-LABEL: define dso_local void @fh3
+// CHECK-LA32-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_FLOATBFLOAT3:%.*]]) align 4 [[AGG_RESULT:%.*]], float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT: entry:
+// CHECK-LA32-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// CHECK-LA32-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-LA32-NEXT: store float [[TMP0]], ptr [[A1]], align 4
+// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4
+// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 3
+// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 4
+// CHECK-LA32-NEXT: ret void
+//
+struct floatbfloat3 fh3(float a, __bf16 b, __bf16 c, __bf16 d) {
+ struct floatbfloat3 x;
+ x.a = a;
+ x.b = b;
+ x.c = c;
+ x.d = d;
+ return x;
+}
+
+struct bfloat5 {
+ __bf16 a;
+ __bf16 b;
+ __bf16 c;
+ __bf16 d;
+ __bf16 e;
+};
+
+// CHECK-LA64-LABEL: define dso_local [2 x i64] @h5
+// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]], bfloat noundef [[E:%.*]]) #[[ATTR0]] {
+// CHECK-LA64-NEXT: entry:
+// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT5:%.*]], align 2
+// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[E_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i64], align 8
+// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT: store bfloat [[E]], ptr [[E_ADDR]], align 2
+// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 1
+// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 2
+// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 3
+// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2
+// CHECK-LA64-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[E_ADDR]], align 2
+// CHECK-LA64-NEXT: [[E5:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 4
+// CHECK-LA64-NEXT: store bfloat [[TMP4]], ptr [[E5]], align 2
+// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 10, i1 false)
+// CHECK-LA64-NEXT: [[TMP5:%.*]] = load [2 x i64], ptr [[RETVAL_COERCE]], align 8
+// CHECK-LA64-NEXT: ret [2 x i64] [[TMP5]]
+//
+// CHECK-LA32-LABEL: define dso_local void @h5
+// CHECK-LA32-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_BFLOAT5:%.*]]) align 2 [[AGG_RESULT:%.*]], bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]], bfloat noundef [[E:%.*]]) #[[ATTR0]] {
+// CHECK-LA32-NEXT: entry:
+// CHECK-LA32-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: [[E_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-LA32-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2
+// CHECK-LA32-NEXT: store bfloat [[E]], ptr [[E_ADDR]], align 2
+// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2
+// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2
+// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2
+// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2
+// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2
+// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2
+// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2
+// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 3
+// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2
+// CHECK-LA32-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[E_ADDR]], align 2
+// CHECK-LA32-NEXT: [[E5:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 4
+// CHECK-LA32-NEXT: store bfloat [[TMP4]], ptr [[E5]], align 2
+// CHECK-LA32-NEXT: ret void
+//
+struct bfloat5 h5(__bf16 a, __bf16 b, __bf16 c, __bf16 d, __bf16 e) {
+ struct bfloat5 x;
+ x.a = a;
+ x.b = b;
+ x.c = c;
+ x.d = d;
+ x.e = e;
+ return x;
+}
diff --git a/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp b/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp
new file mode 100644
index 0000000..de4a10db
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp
@@ -0,0 +1,12 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// RUN: %clang_cc1 -triple loongarch64 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch32 -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: define dso_local void @_Z3fooDF16b
+// CHECK-SAME: (bfloat noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT: ret void
+//
+void foo(__bf16 b) {}
diff --git a/clang/test/CodeGenCXX/tmp-md-nodes2.cpp b/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
index e508188..8500cf3 100644
--- a/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
+++ b/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
@@ -1,8 +1,6 @@
// REQUIRES: asserts
// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm %s -o - | \
// RUN: FileCheck %s
-// RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm -mllvm --experimental-debuginfo-iterators=true %s -o - | \
-// RUN: FileCheck %s
// This test simply checks that the varargs thunk is created. The failing test
// case asserts.
diff --git a/clang/test/Driver/darwin-ld.c b/clang/test/Driver/darwin-ld.c
index f0ca411..9a8d98c 100644
--- a/clang/test/Driver/darwin-ld.c
+++ b/clang/test/Driver/darwin-ld.c
@@ -240,6 +240,15 @@
// RUN: FileCheck -check-prefix=LINK_NO_IOS_ARM64_LIBGCC_S %s < %t.log
// LINK_NO_IOS_ARM64_LIBGCC_S-NOT: lgcc_s.1
+// Check that clang links with libgcc_s.1 for Mac OS X 10.5 and earlier, but not on arm64
+// RUN: %clang -target x86_64-apple-macosx10.5 -mmacosx-version-min=10.5 -### %t.o 2> %t.log
+// RUN: FileCheck -check-prefix=LINK_OSX_LIBGCC_S %s < %t.log
+// LINK_OSX_LIBGCC_S: lgcc_s.1
+
+// RUN: %clang -target arm64-apple-macosx10.5 -mmacosx-version-min=10.5 -### %t.o 2> %t.log
+// RUN: FileCheck -check-prefix=LINK_NO_OSX_ARM64_LIBGCC_S %s < %t.log
+// LINK_NO_OSX_ARM64_LIBGCC_S-NOT: lgcc_s.1
+
// RUN: %clang -target x86_64-apple-darwin12 -rdynamic -### %t.o \
// RUN: -fuse-ld= -mlinker-version=100 2> %t.log
// RUN: FileCheck -check-prefix=LINK_NO_EXPORT_DYNAMIC %s < %t.log
@@ -385,4 +394,4 @@
// RUN: %clang -target armv7em-apple-darwin -mno-outline -### %t.o 2> %t.log
// RUN: FileCheck -check-prefix=ARMV7EM-MNO_OUTLINE %s < %t.log
// ARMV7EM-MNO_OUTLINE: {{ld(.exe)?"}}
-// ARMV7EM-MNO_OUTLINE-SAME: "-mllvm" "-enable-machine-outliner=never" "-mllvm" "-enable-linkonceodr-outlining" \ No newline at end of file
+// ARMV7EM-MNO_OUTLINE-SAME: "-mllvm" "-enable-machine-outliner=never" "-mllvm" "-enable-linkonceodr-outlining"
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index d8b3f1e..57e5719 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 996d72e..5fd2c02 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,50 +8,39 @@
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
+// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
+// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
-// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
+// RUN: | FileCheck -check-prefixes=BIN,RDC %s
//
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
-// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
-// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
-// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
-// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
+// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
//
// Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index ddd251b..6c69d1d 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
// RUN: touch %t/a.o %t/b.o
// RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,23 +47,22 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// OLD-SAME: "-emit-obj"
-// NEW-SAME: "-emit-llvm-bc"
+// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
+// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
//
// Compile device code in a.cu to code object for gfx900.
@@ -71,71 +70,62 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
//
// Bundle and embed device code in host object for a.cu.
//
-// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// OLD-SAME: "-bundle-align=4096"
-// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
-
-// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
-// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
-// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
+// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
-// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
-
//
// Compile device code in b.hip to code object for gfx803.
//
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
//
// Compile device code in b.hip to code object for gfx900.
@@ -143,49 +133,40 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
//
// Bundle and embed device code in host object for b.hip.
//
-// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// OLD-SAME: "-bundle-align=4096"
-// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
-
-// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
-// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
-// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
+// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
-// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC]]
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW: "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
-
//
// Link host objects.
//
diff --git a/clang/test/Preprocessor/init-x86.c b/clang/test/Preprocessor/init-x86.c
index cb77b55..8ea4ce7 100644
--- a/clang/test/Preprocessor/init-x86.c
+++ b/clang/test/Preprocessor/init-x86.c
@@ -1486,3 +1486,431 @@
// RUN: %clang_cc1 -E -dM -triple=i386-unknown-openbsd -x c++ < /dev/null | FileCheck -match-full-lines -check-prefix I386-OPENBSD-CXX %s
// I386-OPENBSD-CXX: #define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=i386-pc-windows-cygnus -target-cpu pentium4 < /dev/null | FileCheck -match-full-lines -check-prefix I386-CYGWIN %s
+// I386-CYGWIN-NOT:#define _LP64
+// I386-CYGWIN:#define __BIGGEST_ALIGNMENT__ 16
+// I386-CYGWIN:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// I386-CYGWIN:#define __CHAR16_TYPE__ unsigned short
+// I386-CYGWIN:#define __CHAR32_TYPE__ unsigned int
+// I386-CYGWIN:#define __CHAR_BIT__ 8
+// I386-CYGWIN:#define __CYGWIN32__ 1
+// I386-CYGWIN:#define __CYGWIN__ 1
+// I386-CYGWIN:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// I386-CYGWIN:#define __DBL_DIG__ 15
+// I386-CYGWIN:#define __DBL_EPSILON__ 2.2204460492503131e-16
+// I386-CYGWIN:#define __DBL_HAS_DENORM__ 1
+// I386-CYGWIN:#define __DBL_HAS_INFINITY__ 1
+// I386-CYGWIN:#define __DBL_HAS_QUIET_NAN__ 1
+// I386-CYGWIN:#define __DBL_MANT_DIG__ 53
+// I386-CYGWIN:#define __DBL_MAX_10_EXP__ 308
+// I386-CYGWIN:#define __DBL_MAX_EXP__ 1024
+// I386-CYGWIN:#define __DBL_MAX__ 1.7976931348623157e+308
+// I386-CYGWIN:#define __DBL_MIN_10_EXP__ (-307)
+// I386-CYGWIN:#define __DBL_MIN_EXP__ (-1021)
+// I386-CYGWIN:#define __DBL_MIN__ 2.2250738585072014e-308
+// I386-CYGWIN:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// I386-CYGWIN:#define __FLT_DENORM_MIN__ 1.40129846e-45F
+// I386-CYGWIN:#define __FLT_DIG__ 6
+// I386-CYGWIN:#define __FLT_EPSILON__ 1.19209290e-7F
+// I386-CYGWIN:#define __FLT_HAS_DENORM__ 1
+// I386-CYGWIN:#define __FLT_HAS_INFINITY__ 1
+// I386-CYGWIN:#define __FLT_HAS_QUIET_NAN__ 1
+// I386-CYGWIN:#define __FLT_MANT_DIG__ 24
+// I386-CYGWIN:#define __FLT_MAX_10_EXP__ 38
+// I386-CYGWIN:#define __FLT_MAX_EXP__ 128
+// I386-CYGWIN:#define __FLT_MAX__ 3.40282347e+38F
+// I386-CYGWIN:#define __FLT_MIN_10_EXP__ (-37)
+// I386-CYGWIN:#define __FLT_MIN_EXP__ (-125)
+// I386-CYGWIN:#define __FLT_MIN__ 1.17549435e-38F
+// I386-CYGWIN:#define __FLT_RADIX__ 2
+// I386-CYGWIN:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_INT_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// I386-CYGWIN:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// I386-CYGWIN:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// I386-CYGWIN:#define __ILP32__ 1
+// I386-CYGWIN:#define __INT16_C(c) c
+// I386-CYGWIN:#define __INT16_C_SUFFIX__
+// I386-CYGWIN:#define __INT16_FMTd__ "hd"
+// I386-CYGWIN:#define __INT16_FMTi__ "hi"
+// I386-CYGWIN:#define __INT16_MAX__ 32767
+// I386-CYGWIN:#define __INT16_TYPE__ short
+// I386-CYGWIN:#define __INT32_C(c) c
+// I386-CYGWIN:#define __INT32_C_SUFFIX__
+// I386-CYGWIN:#define __INT32_FMTd__ "d"
+// I386-CYGWIN:#define __INT32_FMTi__ "i"
+// I386-CYGWIN:#define __INT32_MAX__ 2147483647
+// I386-CYGWIN:#define __INT32_TYPE__ int
+// I386-CYGWIN:#define __INT64_C(c) c##LL
+// I386-CYGWIN:#define __INT64_C_SUFFIX__ LL
+// I386-CYGWIN:#define __INT64_FMTd__ "lld"
+// I386-CYGWIN:#define __INT64_FMTi__ "lli"
+// I386-CYGWIN:#define __INT64_MAX__ 9223372036854775807LL
+// I386-CYGWIN:#define __INT64_TYPE__ long long int
+// I386-CYGWIN:#define __INT8_C(c) c
+// I386-CYGWIN:#define __INT8_C_SUFFIX__
+// I386-CYGWIN:#define __INT8_FMTd__ "hhd"
+// I386-CYGWIN:#define __INT8_FMTi__ "hhi"
+// I386-CYGWIN:#define __INT8_MAX__ 127
+// I386-CYGWIN:#define __INT8_TYPE__ signed char
+// I386-CYGWIN:#define __INTMAX_C(c) c##LL
+// I386-CYGWIN:#define __INTMAX_C_SUFFIX__ LL
+// I386-CYGWIN:#define __INTMAX_FMTd__ "lld"
+// I386-CYGWIN:#define __INTMAX_FMTi__ "lli"
+// I386-CYGWIN:#define __INTMAX_MAX__ 9223372036854775807LL
+// I386-CYGWIN:#define __INTMAX_TYPE__ long long int
+// I386-CYGWIN:#define __INTMAX_WIDTH__ 64
+// I386-CYGWIN:#define __INTPTR_FMTd__ "d"
+// I386-CYGWIN:#define __INTPTR_FMTi__ "i"
+// I386-CYGWIN:#define __INTPTR_MAX__ 2147483647
+// I386-CYGWIN:#define __INTPTR_TYPE__ int
+// I386-CYGWIN:#define __INTPTR_WIDTH__ 32
+// I386-CYGWIN:#define __INT_FAST16_FMTd__ "hd"
+// I386-CYGWIN:#define __INT_FAST16_FMTi__ "hi"
+// I386-CYGWIN:#define __INT_FAST16_MAX__ 32767
+// I386-CYGWIN:#define __INT_FAST16_TYPE__ short
+// I386-CYGWIN:#define __INT_FAST32_FMTd__ "d"
+// I386-CYGWIN:#define __INT_FAST32_FMTi__ "i"
+// I386-CYGWIN:#define __INT_FAST32_MAX__ 2147483647
+// I386-CYGWIN:#define __INT_FAST32_TYPE__ int
+// I386-CYGWIN:#define __INT_FAST64_FMTd__ "lld"
+// I386-CYGWIN:#define __INT_FAST64_FMTi__ "lli"
+// I386-CYGWIN:#define __INT_FAST64_MAX__ 9223372036854775807LL
+// I386-CYGWIN:#define __INT_FAST64_TYPE__ long long int
+// I386-CYGWIN:#define __INT_FAST8_FMTd__ "hhd"
+// I386-CYGWIN:#define __INT_FAST8_FMTi__ "hhi"
+// I386-CYGWIN:#define __INT_FAST8_MAX__ 127
+// I386-CYGWIN:#define __INT_FAST8_TYPE__ signed char
+// I386-CYGWIN:#define __INT_LEAST16_FMTd__ "hd"
+// I386-CYGWIN:#define __INT_LEAST16_FMTi__ "hi"
+// I386-CYGWIN:#define __INT_LEAST16_MAX__ 32767
+// I386-CYGWIN:#define __INT_LEAST16_TYPE__ short
+// I386-CYGWIN:#define __INT_LEAST32_FMTd__ "d"
+// I386-CYGWIN:#define __INT_LEAST32_FMTi__ "i"
+// I386-CYGWIN:#define __INT_LEAST32_MAX__ 2147483647
+// I386-CYGWIN:#define __INT_LEAST32_TYPE__ int
+// I386-CYGWIN:#define __INT_LEAST64_FMTd__ "lld"
+// I386-CYGWIN:#define __INT_LEAST64_FMTi__ "lli"
+// I386-CYGWIN:#define __INT_LEAST64_MAX__ 9223372036854775807LL
+// I386-CYGWIN:#define __INT_LEAST64_TYPE__ long long int
+// I386-CYGWIN:#define __INT_LEAST8_FMTd__ "hhd"
+// I386-CYGWIN:#define __INT_LEAST8_FMTi__ "hhi"
+// I386-CYGWIN:#define __INT_LEAST8_MAX__ 127
+// I386-CYGWIN:#define __INT_LEAST8_TYPE__ signed char
+// I386-CYGWIN:#define __INT_MAX__ 2147483647
+// I386-CYGWIN:#define __LDBL_DENORM_MIN__ 3.64519953188247460253e-4951L
+// I386-CYGWIN:#define __LDBL_DIG__ 18
+// I386-CYGWIN:#define __LDBL_EPSILON__ 1.08420217248550443401e-19L
+// I386-CYGWIN:#define __LDBL_HAS_DENORM__ 1
+// I386-CYGWIN:#define __LDBL_HAS_INFINITY__ 1
+// I386-CYGWIN:#define __LDBL_HAS_QUIET_NAN__ 1
+// I386-CYGWIN:#define __LDBL_MANT_DIG__ 64
+// I386-CYGWIN:#define __LDBL_MAX_10_EXP__ 4932
+// I386-CYGWIN:#define __LDBL_MAX_EXP__ 16384
+// I386-CYGWIN:#define __LDBL_MAX__ 1.18973149535723176502e+4932L
+// I386-CYGWIN:#define __LDBL_MIN_10_EXP__ (-4931)
+// I386-CYGWIN:#define __LDBL_MIN_EXP__ (-16381)
+// I386-CYGWIN:#define __LDBL_MIN__ 3.36210314311209350626e-4932L
+// I386-CYGWIN:#define __LITTLE_ENDIAN__ 1
+// I386-CYGWIN:#define __LONG_LONG_MAX__ 9223372036854775807LL
+// I386-CYGWIN:#define __LONG_MAX__ 2147483647L
+// I386-CYGWIN-NOT:#define __LP64__
+// I386-CYGWIN:#define __NO_MATH_INLINES 1
+// I386-CYGWIN:#define __POINTER_WIDTH__ 32
+// I386-CYGWIN:#define __PTRDIFF_TYPE__ int
+// I386-CYGWIN:#define __PTRDIFF_WIDTH__ 32
+// I386-CYGWIN:#define __REGISTER_PREFIX__
+// I386-CYGWIN:#define __SCHAR_MAX__ 127
+// I386-CYGWIN:#define __SHRT_MAX__ 32767
+// I386-CYGWIN:#define __SIG_ATOMIC_MAX__ 2147483647
+// I386-CYGWIN:#define __SIG_ATOMIC_WIDTH__ 32
+// I386-CYGWIN:#define __SIZEOF_DOUBLE__ 8
+// I386-CYGWIN:#define __SIZEOF_FLOAT__ 4
+// I386-CYGWIN:#define __SIZEOF_INT__ 4
+// I386-CYGWIN:#define __SIZEOF_LONG_DOUBLE__ 12
+// I386-CYGWIN:#define __SIZEOF_LONG_LONG__ 8
+// I386-CYGWIN:#define __SIZEOF_LONG__ 4
+// I386-CYGWIN:#define __SIZEOF_POINTER__ 4
+// I386-CYGWIN:#define __SIZEOF_PTRDIFF_T__ 4
+// I386-CYGWIN:#define __SIZEOF_SHORT__ 2
+// I386-CYGWIN:#define __SIZEOF_SIZE_T__ 4
+// I386-CYGWIN:#define __SIZEOF_WCHAR_T__ 2
+// I386-CYGWIN:#define __SIZEOF_WINT_T__ 4
+// I386-CYGWIN:#define __SIZE_MAX__ 4294967295U
+// I386-CYGWIN:#define __SIZE_TYPE__ unsigned int
+// I386-CYGWIN:#define __SIZE_WIDTH__ 32
+// I386-CYGWIN:#define __UINT16_C(c) c
+// I386-CYGWIN:#define __UINT16_C_SUFFIX__
+// I386-CYGWIN:#define __UINT16_MAX__ 65535
+// I386-CYGWIN:#define __UINT16_TYPE__ unsigned short
+// I386-CYGWIN:#define __UINT32_C(c) c##U
+// I386-CYGWIN:#define __UINT32_C_SUFFIX__ U
+// I386-CYGWIN:#define __UINT32_MAX__ 4294967295U
+// I386-CYGWIN:#define __UINT32_TYPE__ unsigned int
+// I386-CYGWIN:#define __UINT64_C(c) c##ULL
+// I386-CYGWIN:#define __UINT64_C_SUFFIX__ ULL
+// I386-CYGWIN:#define __UINT64_MAX__ 18446744073709551615ULL
+// I386-CYGWIN:#define __UINT64_TYPE__ long long unsigned int
+// I386-CYGWIN:#define __UINT8_C(c) c
+// I386-CYGWIN:#define __UINT8_C_SUFFIX__
+// I386-CYGWIN:#define __UINT8_MAX__ 255
+// I386-CYGWIN:#define __UINT8_TYPE__ unsigned char
+// I386-CYGWIN:#define __UINTMAX_C(c) c##ULL
+// I386-CYGWIN:#define __UINTMAX_C_SUFFIX__ ULL
+// I386-CYGWIN:#define __UINTMAX_MAX__ 18446744073709551615ULL
+// I386-CYGWIN:#define __UINTMAX_TYPE__ long long unsigned int
+// I386-CYGWIN:#define __UINTMAX_WIDTH__ 64
+// I386-CYGWIN:#define __UINTPTR_MAX__ 4294967295U
+// I386-CYGWIN:#define __UINTPTR_TYPE__ unsigned int
+// I386-CYGWIN:#define __UINTPTR_WIDTH__ 32
+// I386-CYGWIN:#define __UINT_FAST16_MAX__ 65535
+// I386-CYGWIN:#define __UINT_FAST16_TYPE__ unsigned short
+// I386-CYGWIN:#define __UINT_FAST32_MAX__ 4294967295U
+// I386-CYGWIN:#define __UINT_FAST32_TYPE__ unsigned int
+// I386-CYGWIN:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// I386-CYGWIN:#define __UINT_FAST64_TYPE__ long long unsigned int
+// I386-CYGWIN:#define __UINT_FAST8_MAX__ 255
+// I386-CYGWIN:#define __UINT_FAST8_TYPE__ unsigned char
+// I386-CYGWIN:#define __UINT_LEAST16_MAX__ 65535
+// I386-CYGWIN:#define __UINT_LEAST16_TYPE__ unsigned short
+// I386-CYGWIN:#define __UINT_LEAST32_MAX__ 4294967295U
+// I386-CYGWIN:#define __UINT_LEAST32_TYPE__ unsigned int
+// I386-CYGWIN:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// I386-CYGWIN:#define __UINT_LEAST64_TYPE__ long long unsigned int
+// I386-CYGWIN:#define __UINT_LEAST8_MAX__ 255
+// I386-CYGWIN:#define __UINT_LEAST8_TYPE__ unsigned char
+// I386-CYGWIN:#define __USER_LABEL_PREFIX__ _
+// I386-CYGWIN:#define __WCHAR_MAX__ 65535
+// I386-CYGWIN:#define __WCHAR_TYPE__ unsigned short
+// I386-CYGWIN:#define __WCHAR_WIDTH__ 16
+// I386-CYGWIN:#define __WINT_TYPE__ unsigned int
+// I386-CYGWIN:#define __WINT_WIDTH__ 32
+// I386-CYGWIN:#define __i386 1
+// I386-CYGWIN:#define __i386__ 1
+// I386-CYGWIN:#define __unix 1
+// I386-CYGWIN:#define __unix__ 1
+// I386-CYGWIN:#define i386 1
+// I386-CYGWIN:#define unix 1
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=x86_64-pc-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-CYGWIN %s
+// X86_64-CYGWIN:#define _LP64 1
+// X86_64-CYGWIN:#define __BIGGEST_ALIGNMENT__ 16
+// X86_64-CYGWIN:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// X86_64-CYGWIN:#define __CHAR16_TYPE__ unsigned short
+// X86_64-CYGWIN:#define __CHAR32_TYPE__ unsigned int
+// X86_64-CYGWIN:#define __CHAR_BIT__ 8
+// X86_64-CYGWIN:#define __CYGWIN__ 1
+// X86_64-CYGWIN:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// X86_64-CYGWIN:#define __DBL_DIG__ 15
+// X86_64-CYGWIN:#define __DBL_EPSILON__ 2.2204460492503131e-16
+// X86_64-CYGWIN:#define __DBL_HAS_DENORM__ 1
+// X86_64-CYGWIN:#define __DBL_HAS_INFINITY__ 1
+// X86_64-CYGWIN:#define __DBL_HAS_QUIET_NAN__ 1
+// X86_64-CYGWIN:#define __DBL_MANT_DIG__ 53
+// X86_64-CYGWIN:#define __DBL_MAX_10_EXP__ 308
+// X86_64-CYGWIN:#define __DBL_MAX_EXP__ 1024
+// X86_64-CYGWIN:#define __DBL_MAX__ 1.7976931348623157e+308
+// X86_64-CYGWIN:#define __DBL_MIN_10_EXP__ (-307)
+// X86_64-CYGWIN:#define __DBL_MIN_EXP__ (-1021)
+// X86_64-CYGWIN:#define __DBL_MIN__ 2.2250738585072014e-308
+// X86_64-CYGWIN:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// X86_64-CYGWIN:#define __FLT_DENORM_MIN__ 1.40129846e-45F
+// X86_64-CYGWIN:#define __FLT_DIG__ 6
+// X86_64-CYGWIN:#define __FLT_EPSILON__ 1.19209290e-7F
+// X86_64-CYGWIN:#define __FLT_HAS_DENORM__ 1
+// X86_64-CYGWIN:#define __FLT_HAS_INFINITY__ 1
+// X86_64-CYGWIN:#define __FLT_HAS_QUIET_NAN__ 1
+// X86_64-CYGWIN:#define __FLT_MANT_DIG__ 24
+// X86_64-CYGWIN:#define __FLT_MAX_10_EXP__ 38
+// X86_64-CYGWIN:#define __FLT_MAX_EXP__ 128
+// X86_64-CYGWIN:#define __FLT_MAX__ 3.40282347e+38F
+// X86_64-CYGWIN:#define __FLT_MIN_10_EXP__ (-37)
+// X86_64-CYGWIN:#define __FLT_MIN_EXP__ (-125)
+// X86_64-CYGWIN:#define __FLT_MIN__ 1.17549435e-38F
+// X86_64-CYGWIN:#define __FLT_RADIX__ 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_INT_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// X86_64-CYGWIN:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// X86_64-CYGWIN:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// X86_64-CYGWIN:#define __INT16_C(c) c
+// X86_64-CYGWIN:#define __INT16_C_SUFFIX__
+// X86_64-CYGWIN:#define __INT16_FMTd__ "hd"
+// X86_64-CYGWIN:#define __INT16_FMTi__ "hi"
+// X86_64-CYGWIN:#define __INT16_MAX__ 32767
+// X86_64-CYGWIN:#define __INT16_TYPE__ short
+// X86_64-CYGWIN:#define __INT32_C(c) c
+// X86_64-CYGWIN:#define __INT32_C_SUFFIX__
+// X86_64-CYGWIN:#define __INT32_FMTd__ "d"
+// X86_64-CYGWIN:#define __INT32_FMTi__ "i"
+// X86_64-CYGWIN:#define __INT32_MAX__ 2147483647
+// X86_64-CYGWIN:#define __INT32_TYPE__ int
+// X86_64-CYGWIN:#define __INT64_C(c) c##L
+// X86_64-CYGWIN:#define __INT64_C_SUFFIX__ L
+// X86_64-CYGWIN:#define __INT64_FMTd__ "ld"
+// X86_64-CYGWIN:#define __INT64_FMTi__ "li"
+// X86_64-CYGWIN:#define __INT64_MAX__ 9223372036854775807L
+// X86_64-CYGWIN:#define __INT64_TYPE__ long int
+// X86_64-CYGWIN:#define __INT8_C(c) c
+// X86_64-CYGWIN:#define __INT8_C_SUFFIX__
+// X86_64-CYGWIN:#define __INT8_FMTd__ "hhd"
+// X86_64-CYGWIN:#define __INT8_FMTi__ "hhi"
+// X86_64-CYGWIN:#define __INT8_MAX__ 127
+// X86_64-CYGWIN:#define __INT8_TYPE__ signed char
+// X86_64-CYGWIN:#define __INTMAX_C(c) c##L
+// X86_64-CYGWIN:#define __INTMAX_C_SUFFIX__ L
+// X86_64-CYGWIN:#define __INTMAX_FMTd__ "ld"
+// X86_64-CYGWIN:#define __INTMAX_FMTi__ "li"
+// X86_64-CYGWIN:#define __INTMAX_MAX__ 9223372036854775807L
+// X86_64-CYGWIN:#define __INTMAX_TYPE__ long int
+// X86_64-CYGWIN:#define __INTMAX_WIDTH__ 64
+// X86_64-CYGWIN:#define __INTPTR_FMTd__ "ld"
+// X86_64-CYGWIN:#define __INTPTR_FMTi__ "li"
+// X86_64-CYGWIN:#define __INTPTR_MAX__ 9223372036854775807L
+// X86_64-CYGWIN:#define __INTPTR_TYPE__ long int
+// X86_64-CYGWIN:#define __INTPTR_WIDTH__ 64
+// X86_64-CYGWIN:#define __INT_FAST16_FMTd__ "hd"
+// X86_64-CYGWIN:#define __INT_FAST16_FMTi__ "hi"
+// X86_64-CYGWIN:#define __INT_FAST16_MAX__ 32767
+// X86_64-CYGWIN:#define __INT_FAST16_TYPE__ short
+// X86_64-CYGWIN:#define __INT_FAST32_FMTd__ "d"
+// X86_64-CYGWIN:#define __INT_FAST32_FMTi__ "i"
+// X86_64-CYGWIN:#define __INT_FAST32_MAX__ 2147483647
+// X86_64-CYGWIN:#define __INT_FAST32_TYPE__ int
+// X86_64-CYGWIN:#define __INT_FAST64_FMTd__ "ld"
+// X86_64-CYGWIN:#define __INT_FAST64_FMTi__ "li"
+// X86_64-CYGWIN:#define __INT_FAST64_MAX__ 9223372036854775807L
+// X86_64-CYGWIN:#define __INT_FAST64_TYPE__ long int
+// X86_64-CYGWIN:#define __INT_FAST8_FMTd__ "hhd"
+// X86_64-CYGWIN:#define __INT_FAST8_FMTi__ "hhi"
+// X86_64-CYGWIN:#define __INT_FAST8_MAX__ 127
+// X86_64-CYGWIN:#define __INT_FAST8_TYPE__ signed char
+// X86_64-CYGWIN:#define __INT_LEAST16_FMTd__ "hd"
+// X86_64-CYGWIN:#define __INT_LEAST16_FMTi__ "hi"
+// X86_64-CYGWIN:#define __INT_LEAST16_MAX__ 32767
+// X86_64-CYGWIN:#define __INT_LEAST16_TYPE__ short
+// X86_64-CYGWIN:#define __INT_LEAST32_FMTd__ "d"
+// X86_64-CYGWIN:#define __INT_LEAST32_FMTi__ "i"
+// X86_64-CYGWIN:#define __INT_LEAST32_MAX__ 2147483647
+// X86_64-CYGWIN:#define __INT_LEAST32_TYPE__ int
+// X86_64-CYGWIN:#define __INT_LEAST64_FMTd__ "ld"
+// X86_64-CYGWIN:#define __INT_LEAST64_FMTi__ "li"
+// X86_64-CYGWIN:#define __INT_LEAST64_MAX__ 9223372036854775807L
+// X86_64-CYGWIN:#define __INT_LEAST64_TYPE__ long int
+// X86_64-CYGWIN:#define __INT_LEAST8_FMTd__ "hhd"
+// X86_64-CYGWIN:#define __INT_LEAST8_FMTi__ "hhi"
+// X86_64-CYGWIN:#define __INT_LEAST8_MAX__ 127
+// X86_64-CYGWIN:#define __INT_LEAST8_TYPE__ signed char
+// X86_64-CYGWIN:#define __INT_MAX__ 2147483647
+// X86_64-CYGWIN:#define __LDBL_DENORM_MIN__ 3.64519953188247460253e-4951L
+// X86_64-CYGWIN:#define __LDBL_DIG__ 18
+// X86_64-CYGWIN:#define __LDBL_EPSILON__ 1.08420217248550443401e-19L
+// X86_64-CYGWIN:#define __LDBL_HAS_DENORM__ 1
+// X86_64-CYGWIN:#define __LDBL_HAS_INFINITY__ 1
+// X86_64-CYGWIN:#define __LDBL_HAS_QUIET_NAN__ 1
+// X86_64-CYGWIN:#define __LDBL_MANT_DIG__ 64
+// X86_64-CYGWIN:#define __LDBL_MAX_10_EXP__ 4932
+// X86_64-CYGWIN:#define __LDBL_MAX_EXP__ 16384
+// X86_64-CYGWIN:#define __LDBL_MAX__ 1.18973149535723176502e+4932L
+// X86_64-CYGWIN:#define __LDBL_MIN_10_EXP__ (-4931)
+// X86_64-CYGWIN:#define __LDBL_MIN_EXP__ (-16381)
+// X86_64-CYGWIN:#define __LDBL_MIN__ 3.36210314311209350626e-4932L
+// X86_64-CYGWIN:#define __LITTLE_ENDIAN__ 1
+// X86_64-CYGWIN:#define __LONG_LONG_MAX__ 9223372036854775807LL
+// X86_64-CYGWIN:#define __LONG_MAX__ 9223372036854775807L
+// X86_64-CYGWIN:#define __LP64__ 1
+// X86_64-CYGWIN:#define __MMX__ 1
+// X86_64-CYGWIN:#define __NO_MATH_INLINES 1
+// X86_64-CYGWIN:#define __POINTER_WIDTH__ 64
+// X86_64-CYGWIN:#define __PTRDIFF_TYPE__ long int
+// X86_64-CYGWIN:#define __PTRDIFF_WIDTH__ 64
+// X86_64-CYGWIN:#define __REGISTER_PREFIX__
+// X86_64-CYGWIN:#define __SCHAR_MAX__ 127
+// X86_64-CYGWIN:#define __SHRT_MAX__ 32767
+// X86_64-CYGWIN:#define __SIG_ATOMIC_MAX__ 2147483647
+// X86_64-CYGWIN:#define __SIG_ATOMIC_WIDTH__ 32
+// X86_64-CYGWIN:#define __SIZEOF_DOUBLE__ 8
+// X86_64-CYGWIN:#define __SIZEOF_FLOAT__ 4
+// X86_64-CYGWIN:#define __SIZEOF_INT__ 4
+// X86_64-CYGWIN:#define __SIZEOF_LONG_DOUBLE__ 16
+// X86_64-CYGWIN:#define __SIZEOF_LONG_LONG__ 8
+// X86_64-CYGWIN:#define __SIZEOF_LONG__ 8
+// X86_64-CYGWIN:#define __SIZEOF_POINTER__ 8
+// X86_64-CYGWIN:#define __SIZEOF_PTRDIFF_T__ 8
+// X86_64-CYGWIN:#define __SIZEOF_SHORT__ 2
+// X86_64-CYGWIN:#define __SIZEOF_SIZE_T__ 8
+// X86_64-CYGWIN:#define __SIZEOF_WCHAR_T__ 2
+// X86_64-CYGWIN:#define __SIZEOF_WINT_T__ 4
+// X86_64-CYGWIN:#define __SIZE_MAX__ 18446744073709551615UL
+// X86_64-CYGWIN:#define __SIZE_TYPE__ long unsigned int
+// X86_64-CYGWIN:#define __SIZE_WIDTH__ 64
+// X86_64-CYGWIN:#define __SSE2_MATH__ 1
+// X86_64-CYGWIN:#define __SSE2__ 1
+// X86_64-CYGWIN:#define __SSE_MATH__ 1
+// X86_64-CYGWIN:#define __SSE__ 1
+// X86_64-CYGWIN:#define __UINT16_C(c) c
+// X86_64-CYGWIN:#define __UINT16_C_SUFFIX__
+// X86_64-CYGWIN:#define __UINT16_MAX__ 65535
+// X86_64-CYGWIN:#define __UINT16_TYPE__ unsigned short
+// X86_64-CYGWIN:#define __UINT32_C(c) c##U
+// X86_64-CYGWIN:#define __UINT32_C_SUFFIX__ U
+// X86_64-CYGWIN:#define __UINT32_MAX__ 4294967295U
+// X86_64-CYGWIN:#define __UINT32_TYPE__ unsigned int
+// X86_64-CYGWIN:#define __UINT64_C(c) c##UL
+// X86_64-CYGWIN:#define __UINT64_C_SUFFIX__ UL
+// X86_64-CYGWIN:#define __UINT64_MAX__ 18446744073709551615UL
+// X86_64-CYGWIN:#define __UINT64_TYPE__ long unsigned int
+// X86_64-CYGWIN:#define __UINT8_C(c) c
+// X86_64-CYGWIN:#define __UINT8_C_SUFFIX__
+// X86_64-CYGWIN:#define __UINT8_MAX__ 255
+// X86_64-CYGWIN:#define __UINT8_TYPE__ unsigned char
+// X86_64-CYGWIN:#define __UINTMAX_C(c) c##UL
+// X86_64-CYGWIN:#define __UINTMAX_C_SUFFIX__ UL
+// X86_64-CYGWIN:#define __UINTMAX_MAX__ 18446744073709551615UL
+// X86_64-CYGWIN:#define __UINTMAX_TYPE__ long unsigned int
+// X86_64-CYGWIN:#define __UINTMAX_WIDTH__ 64
+// X86_64-CYGWIN:#define __UINTPTR_MAX__ 18446744073709551615UL
+// X86_64-CYGWIN:#define __UINTPTR_TYPE__ long unsigned int
+// X86_64-CYGWIN:#define __UINTPTR_WIDTH__ 64
+// X86_64-CYGWIN:#define __UINT_FAST16_MAX__ 65535
+// X86_64-CYGWIN:#define __UINT_FAST16_TYPE__ unsigned short
+// X86_64-CYGWIN:#define __UINT_FAST32_MAX__ 4294967295U
+// X86_64-CYGWIN:#define __UINT_FAST32_TYPE__ unsigned int
+// X86_64-CYGWIN:#define __UINT_FAST64_MAX__ 18446744073709551615UL
+// X86_64-CYGWIN:#define __UINT_FAST64_TYPE__ long unsigned int
+// X86_64-CYGWIN:#define __UINT_FAST8_MAX__ 255
+// X86_64-CYGWIN:#define __UINT_FAST8_TYPE__ unsigned char
+// X86_64-CYGWIN:#define __UINT_LEAST16_MAX__ 65535
+// X86_64-CYGWIN:#define __UINT_LEAST16_TYPE__ unsigned short
+// X86_64-CYGWIN:#define __UINT_LEAST32_MAX__ 4294967295U
+// X86_64-CYGWIN:#define __UINT_LEAST32_TYPE__ unsigned int
+// X86_64-CYGWIN:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
+// X86_64-CYGWIN:#define __UINT_LEAST64_TYPE__ long unsigned int
+// X86_64-CYGWIN:#define __UINT_LEAST8_MAX__ 255
+// X86_64-CYGWIN:#define __UINT_LEAST8_TYPE__ unsigned char
+// X86_64-CYGWIN:#define __USER_LABEL_PREFIX__
+// X86_64-CYGWIN:#define __WCHAR_MAX__ 65535
+// X86_64-CYGWIN:#define __WCHAR_TYPE__ unsigned short
+// X86_64-CYGWIN:#define __WCHAR_WIDTH__ 16
+// X86_64-CYGWIN:#define __WINT_TYPE__ unsigned int
+// X86_64-CYGWIN:#define __WINT_WIDTH__ 32
+// X86_64-CYGWIN:#define __amd64 1
+// X86_64-CYGWIN:#define __amd64__ 1
+// X86_64-CYGWIN:#define __unix 1
+// X86_64-CYGWIN:#define __unix__ 1
+// X86_64-CYGWIN:#define __x86_64 1
+// X86_64-CYGWIN:#define __x86_64__ 1
+// X86_64-CYGWIN:#define unix 1
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 4c8d519..031a6c1 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -2090,12 +2090,6 @@
// WEBASSEMBLY-CXX-ATOMICS:#define _REENTRANT 1
// WEBASSEMBLY-CXX-ATOMICS:#define __STDCPP_THREADS__ 1
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple i686-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix CYGWIN-X32 %s
-// CYGWIN-X32: #define __USER_LABEL_PREFIX__ _
-
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple x86_64-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix CYGWIN-X64 %s
-// CYGWIN-X64: #define __USER_LABEL_PREFIX__
-
// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=avr \
// RUN: < /dev/null \
// RUN: | FileCheck -match-full-lines -check-prefix=AVR %s
diff --git a/clang/test/Sema/warn-unreachable_crash.cpp b/clang/test/Sema/warn-unreachable_crash.cpp
new file mode 100644
index 0000000..628abcc
--- /dev/null
+++ b/clang/test/Sema/warn-unreachable_crash.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -verify -Wunreachable-code %s
+
+// Previously, this test would crash
+static void test(__fp16& x) {
+ if (x != 0 || x != 1.0) { // expected-note{{}} no-crash
+ x = 0.9;
+ } else
+ x = 0.8; // expected-warning{{code will never be executed}}
+}
+
+static void test2(__fp16& x) {
+ if (x != 1 && x == 1.0) { // expected-note{{}} no-crash
+ x = 0.9; // expected-warning{{code will never be executed}}
+ } else
+ x = 0.8;
+}
diff --git a/clang/test/SemaCUDA/bad-attributes.cu b/clang/test/SemaCUDA/bad-attributes.cu
index 8ac6fa1..acb2f38 100644
--- a/clang/test/SemaCUDA/bad-attributes.cu
+++ b/clang/test/SemaCUDA/bad-attributes.cu
@@ -50,6 +50,14 @@ __global__ __device__ void z11(); // expected-error {{attributes are not compat
__global__ __host__ void z12(); // expected-error {{attributes are not compatible}}
// expected-note@-1 {{conflicting attribute is here}}
+// Make sure GPU-side variables do not allow __attribute__((address_space(N)))
+// expected-error@+1 {{__constant__, __device__, and __shared__ variables must use default address space}}
+__shared__ __attribute__((address_space(999))) int as_s;
+// expected-error@+1 {{__constant__, __device__, and __shared__ variables must use default address space}}
+__device__ __attribute__((address_space(999))) int as_d;
+// expected-error@+1 {{__constant__, __device__, and __shared__ variables must use default address space}}
+__constant__ __attribute__((address_space(999))) int as_c;
+
struct S {
__global__ void foo() {}; // expected-error {{must be a free function or static member function}}
__global__ static void bar(); // expected-warning {{kernel function 'bar' is a member function}}
diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
index 062becd..aff172e 100644
--- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
+++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
@@ -333,6 +333,31 @@ struct CopyAssign1 {
CopyAssign1 & operator=(CopyAssign1 const &) = default;
};
+struct UserDeleted1 {
+ UserDeleted1(const UserDeleted1&) = delete;
+};
+static_assert(!__builtin_is_cpp_trivially_relocatable(UserDeleted1));
+static_assert(!__builtin_is_replaceable(UserDeleted1));
+
+struct UserDeleted2 {
+ UserDeleted2(UserDeleted2&&) = delete;
+};
+static_assert(!__builtin_is_cpp_trivially_relocatable(UserDeleted2));
+static_assert(!__builtin_is_replaceable(UserDeleted2));
+
+
+struct UserDeleted3 {
+ UserDeleted3 operator=(UserDeleted3);
+};
+static_assert(!__builtin_is_cpp_trivially_relocatable(UserDeleted3));
+static_assert(!__builtin_is_replaceable(UserDeleted3));
+
+struct UserDeleted4 {
+ UserDeleted4 operator=(UserDeleted4&&);
+};
+static_assert(!__builtin_is_cpp_trivially_relocatable(UserDeleted4));
+static_assert(!__builtin_is_replaceable(UserDeleted4));
+
}
diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp
index 589616e..ed48029 100644
--- a/clang/test/SemaCXX/destructor.cpp
+++ b/clang/test/SemaCXX/destructor.cpp
@@ -58,7 +58,7 @@ struct D {
struct D2 {
void ~D2() { } // \
- // expected-error{{destructor cannot have a return type}}
+ // expected-error{{destructor cannot have a return type}}
};
@@ -86,7 +86,7 @@ struct G {
G::~G() { }
struct H {
- ~H(void) { }
+ ~H(void) { }
};
struct X {};
@@ -103,7 +103,7 @@ namespace PR6421 {
template<typename U>
void foo(T t) // expected-error{{variable has incomplete type}}
{ }
-
+
void disconnect()
{
T* t;
@@ -364,7 +364,7 @@ struct __is_destructor_wellformed {
decltype(_Tp1().~_Tp1())>::type);
template <typename _Tp1>
static __two __test (...);
-
+
static const bool value = sizeof(__test<_Tp>(12)) == sizeof(char);
};
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
index 498e202..329b611 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
@@ -171,3 +171,24 @@ void test() {
// expected-note@#concept4 {{because it is a reference type}}
}
}
+
+
+namespace std {
+template <typename T>
+struct is_replaceable {
+ static constexpr bool value = __builtin_is_replaceable(T);
+};
+
+template <typename T>
+constexpr bool is_replaceable_v = __builtin_is_replaceable(T);
+
+}
+
+static_assert(std::is_replaceable<int&>::value);
+// expected-error@-1 {{static assertion failed due to requirement 'std::is_replaceable<int &>::value'}} \
+// expected-note@-1 {{'int &' is not replaceable}} \
+// expected-note@-1 {{because it is a reference type}}
+static_assert(std::is_replaceable_v<int&>);
+// expected-error@-1 {{static assertion failed due to requirement 'std::is_replaceable_v<int &>'}} \
+// expected-note@-1 {{'int &' is not replaceable}} \
+// expected-note@-1 {{because it is a reference type}}
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index 0256569..9e05303 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -142,6 +142,181 @@ static_assert(__builtin_is_cpp_trivially_relocatable(U2));
// expected-note@-1 {{because it has a deleted destructor}} \
// expected-note@-1 {{because it has a non-trivially-relocatable member 'b' of type 'B'}} \
// expected-note@#tr-U2 {{'U2' defined here}}
+}
+
+namespace replaceable {
+
+extern int vla_size;
+static_assert(__builtin_is_replaceable(int[vla_size]));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(int[vla_size])'}} \
+// expected-note@-1 {{'int[vla_size]' is not replaceable}} \
+// expected-note@-1 {{because it is a variably-modified type}}
+
+struct S; // expected-note {{forward declaration of 'replaceable::S'}}
+static_assert(__builtin_is_replaceable(S));
+// expected-error@-1 {{incomplete type 'S' used in type trait expression}}
+
+static_assert(__builtin_is_replaceable(const volatile int));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(const volatile int)}} \
+// expected-note@-1 {{'const volatile int' is not replaceable}} \
+// expected-note@-1 {{because it is const}} \
+// expected-note@-1 {{because it is volatile}}
+
+
+static_assert(__builtin_is_replaceable(void()));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(void ())}} \
+// expected-note@-1 {{'void ()' is not replaceable}} \
+// expected-note@-1 {{because it not a scalar or class type}}
+
+struct B {
+ virtual ~B();
+};
+struct S : virtual B { // #replaceable-S
+ S();
+ int & a;
+ const int ci;
+ B & b;
+ B c;
+ ~S();
+};
+static_assert(__builtin_is_replaceable(S));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::S)'}} \
+// expected-note@-1 {{'S' is not replaceable}} \
+// expected-note@-1 {{because it has a non-replaceable base 'B'}} \
+// expected-note@-1 {{because it has a non-replaceable member 'a' of type 'int &'}} \
+// expected-note@-1 {{because it has a non-replaceable member 'ci' of type 'const int'}} \
+// expected-note@-1 {{because it has a non-replaceable member 'b' of type 'B &'}} \
+// expected-note@-1 {{because it has a non-replaceable member 'c' of type 'B'}} \
+// expected-note@-1 {{because it has a user-provided destructor}} \
+// expected-note@-1 {{because it has a deleted copy assignment operator}}
+// expected-note@#replaceable-S {{'S' defined here}}
+
+struct S2 { // #replaceable-S2
+ S2(S2&&);
+ S2& operator=(const S2&);
+};
+static_assert(__builtin_is_replaceable(S2));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::S2)'}} \
+// expected-note@-1 {{'S2' is not replaceable}} \
+// expected-note@-1 {{because it has a user provided move constructor}} \
+// expected-note@-1 {{because it has a user provided copy assignment operator}} \
+// expected-note@#replaceable-S2 {{'S2' defined here}}
+
+
+struct S3 { // #replaceable-S3
+ ~S3() = delete;
+};
+static_assert(__builtin_is_replaceable(S3));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::S3)'}} \
+// expected-note@-1 {{'S3' is not replaceable}} \
+// expected-note@-1 {{because it has a deleted destructor}} \
+// expected-note@#replaceable-S3 {{'S3' defined here}}
+
+
+union U { // #replaceable-U
+ U(const U&);
+ U(U&&);
+ U& operator=(const U&);
+ U& operator=(U&&);
+};
+static_assert(__builtin_is_replaceable(U));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::U)'}} \
+// expected-note@-1 {{'U' is not replaceable}} \
+// expected-note@-1 {{because it is a union with a user-declared copy constructor}} \
+// expected-note@-1 {{because it is a union with a user-declared copy assignment operator}} \
+// expected-note@-1 {{because it is a union with a user-declared move constructor}} \
+// expected-note@-1 {{because it is a union with a user-declared move assignment operator}}
+// expected-note@#replaceable-U {{'U' defined here}}
+struct S4 replaceable_if_eligible { // #replaceable-S4
+ ~S4();
+ B b;
+};
+static_assert(__builtin_is_replaceable(S4));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::S4)'}} \
+// expected-note@-1 {{'S4' is not replaceable}} \
+// expected-note@-1 {{because it has a non-replaceable member 'b' of type 'B'}} \
+// expected-note@#replaceable-S4 {{'S4' defined here}}
+
+union U2 replaceable_if_eligible { // #replaceable-U2
+ U2(const U2&);
+ U2(U2&&);
+ B b;
+};
+static_assert(__builtin_is_replaceable(U2));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::U2)'}} \
+// expected-note@-1 {{'U2' is not replaceable}} \
+// expected-note@-1 {{because it has a deleted destructor}} \
+// expected-note@-1 {{because it has a non-replaceable member 'b' of type 'B'}} \
+// expected-note@-1 {{because it has a deleted copy assignment operator}} \
+// expected-note@#replaceable-U2 {{'U2' defined here}}
+
+struct UD1 { // #replaceable-UD1
+ UD1(const UD1&) = delete;
+ UD1 & operator=(const UD1&) = delete;
+
+};
+static_assert(__builtin_is_replaceable(UD1));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::UD1)'}} \
+// expected-note@-1 {{'UD1' is not replaceable}} \
+// expected-note@-1 {{because it has a deleted copy constructor}} \
+// expected-note@-1 {{because it has a deleted copy assignment operator}} \
+// expected-note@#replaceable-UD1 {{'UD1' defined here}}
+
+
+struct UD2 { // #replaceable-UD2
+ UD2(UD2&&) = delete;
+ UD2 & operator=(UD2&&) = delete;
+};
+static_assert(__builtin_is_replaceable(UD2));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(replaceable::UD2)'}} \
+// expected-note@-1 {{'UD2' is not replaceable}} \
+// expected-note@-1 {{because it has a deleted move constructor}} \
+// expected-note@-1 {{because it has a deleted move assignment operator}} \
+// expected-note@#replaceable-UD2 {{'UD2' defined here}}
+
+}
+
+
+namespace GH143325 {
+struct Foo { // expected-note {{previous definition is here}}
+ Foo(const Foo&);
+ ~Foo();
+};
+
+struct Foo { // expected-error {{redefinition of 'Foo'}}
+ Foo();
+ int;
+};
+struct Wrapper { // #GH143325-Wrapper
+ union {
+ Foo p;
+ } u;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(Wrapper));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_cpp_trivially_relocatable(GH143325::Wrapper)'}} \
+// expected-note@-1 {{'Wrapper' is not trivially relocatable}} \
+// expected-note@-1 {{because it has a non-trivially-relocatable member 'u' of type 'union}} \
+// expected-note@-1 {{because it has a deleted destructor}}
+// expected-note@#GH143325-Wrapper {{'Wrapper' defined here}}
+
+struct Polymorphic {
+ virtual ~Polymorphic();
+};
+
+struct UnionOfPolymorphic { // #GH143325-UnionOfPolymorphic
+ union {
+ Polymorphic p;
+ int i;
+ } u;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(UnionOfPolymorphic));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_cpp_trivially_relocatable(GH143325::UnionOfPolymorphic)'}} \
+// expected-note@-1 {{'UnionOfPolymorphic' is not trivially relocatable}} \
+// expected-note@-1 {{because it has a non-trivially-relocatable member 'u' of type 'union}} \
+// expected-note@-1 {{because it has a deleted destructor}} \
+// expected-note@#GH143325-UnionOfPolymorphic {{'UnionOfPolymorphic' defined here}}
}
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index ad12672..b22d3aa 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -244,17 +244,17 @@ static bool fillRanges(MemoryBuffer *Code,
DiagnosticsEngine Diagnostics(
IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs), DiagOpts);
SourceManager Sources(Diagnostics, Files);
- const auto ID = createInMemoryFile("<irrelevant>", *Code, Sources, Files,
- InMemoryFileSystem.get());
+ FileID ID = createInMemoryFile("<irrelevant>", *Code, Sources, Files,
+ InMemoryFileSystem.get());
if (!LineRanges.empty()) {
if (!Offsets.empty() || !Lengths.empty()) {
errs() << "error: cannot use -lines with -offset/-length\n";
return true;
}
- for (const auto &LineRange : LineRanges) {
+ for (unsigned i = 0, e = LineRanges.size(); i < e; ++i) {
unsigned FromLine, ToLine;
- if (parseLineRange(LineRange, FromLine, ToLine)) {
+ if (parseLineRange(LineRanges[i], FromLine, ToLine)) {
errs() << "error: invalid <start line>:<end line> pair\n";
return true;
}
@@ -266,12 +266,12 @@ static bool fillRanges(MemoryBuffer *Code,
errs() << "error: start line should not exceed end line\n";
return true;
}
- const auto Start = Sources.translateLineCol(ID, FromLine, 1);
- const auto End = Sources.translateLineCol(ID, ToLine, UINT_MAX);
+ SourceLocation Start = Sources.translateLineCol(ID, FromLine, 1);
+ SourceLocation End = Sources.translateLineCol(ID, ToLine, UINT_MAX);
if (Start.isInvalid() || End.isInvalid())
return true;
- const auto Offset = Sources.getFileOffset(Start);
- const auto Length = Sources.getFileOffset(End) - Offset;
+ unsigned Offset = Sources.getFileOffset(Start);
+ unsigned Length = Sources.getFileOffset(End) - Offset;
Ranges.push_back(tooling::Range(Offset, Length));
}
return false;
@@ -279,25 +279,32 @@ static bool fillRanges(MemoryBuffer *Code,
if (Offsets.empty())
Offsets.push_back(0);
- if (Offsets.size() == 1 && Lengths.empty()) {
- Lengths.push_back(Sources.getFileOffset(Sources.getLocForEndOfFile(ID)) -
- Offsets[0]);
- } else if (Offsets.size() != Lengths.size()) {
+ if (Offsets.size() != Lengths.size() &&
+ !(Offsets.size() == 1 && Lengths.empty())) {
errs() << "error: number of -offset and -length arguments must match.\n";
return true;
}
- for (unsigned I = 0, E = Offsets.size(); I < E; ++I) {
- const auto Offset = Offsets[I];
- if (Offset >= Code->getBufferSize()) {
- errs() << "error: offset " << Offset << " is outside the file\n";
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ if (Offsets[i] >= Code->getBufferSize()) {
+ errs() << "error: offset " << Offsets[i] << " is outside the file\n";
return true;
}
- const auto Length = Lengths[I];
- if (Offset + Length > Code->getBufferSize()) {
- errs() << "error: invalid length " << Length << ", offset + length ("
- << Offset + Length << ") is outside the file.\n";
- return true;
+ SourceLocation Start =
+ Sources.getLocForStartOfFile(ID).getLocWithOffset(Offsets[i]);
+ SourceLocation End;
+ if (i < Lengths.size()) {
+ if (Offsets[i] + Lengths[i] > Code->getBufferSize()) {
+ errs() << "error: invalid length " << Lengths[i]
+ << ", offset + length (" << Offsets[i] + Lengths[i]
+ << ") is outside the file.\n";
+ return true;
+ }
+ End = Start.getLocWithOffset(Lengths[i]);
+ } else {
+ End = Sources.getLocForEndOfFile(ID);
}
+ unsigned Offset = Sources.getFileOffset(Start);
+ unsigned Length = Sources.getFileOffset(End) - Offset;
Ranges.push_back(tooling::Range(Offset, Length));
}
return false;
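
The reworked fillRanges derives both the -lines and the -offset/-length ranges through SourceLocations and converts back to (offset, length) pairs at the end, so both paths share one exit and one set of bounds checks. A minimal standalone sketch of the same bounds arithmetic (plain C++, not the clang SourceManager API):

    #include <cstddef>
    #include <optional>
    #include <utility>

    // Stand-in for the logic above: an explicit -length is validated
    // against the buffer; a missing one means "to end of file".
    std::optional<std::pair<std::size_t, std::size_t>> // {offset, length}
    makeRange(std::size_t offset, std::optional<std::size_t> length,
              std::size_t bufferSize) {
      if (offset >= bufferSize)
        return std::nullopt;                     // offset outside the file
      std::size_t end = length ? offset + *length : bufferSize;
      if (end > bufferSize)
        return std::nullopt;                     // offset + length outside
      return std::make_pair(offset, end - offset);
    }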
diff --git a/clang/unittests/DirectoryWatcher/DirectoryWatcherTest.cpp b/clang/unittests/DirectoryWatcher/DirectoryWatcherTest.cpp
index 4ad6f80..4cb2602 100644
--- a/clang/unittests/DirectoryWatcher/DirectoryWatcherTest.cpp
+++ b/clang/unittests/DirectoryWatcher/DirectoryWatcherTest.cpp
@@ -138,7 +138,7 @@ struct VerifyingConsumer {
void consumeInitial(DirectoryWatcher::Event E) {
std::unique_lock<std::mutex> L(Mtx);
- auto It = std::find(ExpectedInitial.begin(), ExpectedInitial.end(), E);
+ auto It = llvm::find(ExpectedInitial, E);
if (It == ExpectedInitial.end()) {
UnexpectedInitial.push_back(E);
} else {
@@ -152,11 +152,9 @@ struct VerifyingConsumer {
void consumeNonInitial(DirectoryWatcher::Event E) {
std::unique_lock<std::mutex> L(Mtx);
- auto It =
- std::find(ExpectedNonInitial.begin(), ExpectedNonInitial.end(), E);
+ auto It = llvm::find(ExpectedNonInitial, E);
if (It == ExpectedNonInitial.end()) {
- auto OptIt =
- std::find(OptionalNonInitial.begin(), OptionalNonInitial.end(), E);
+ auto OptIt = llvm::find(OptionalNonInitial, E);
if (OptIt != OptionalNonInitial.end()) {
OptionalNonInitial.erase(OptIt);
} else {
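
llvm::find is the range-based wrapper over std::find, so this change is behavior-preserving. A rough standalone equivalent, assuming nothing from LLVM:

    #include <algorithm>
    #include <iterator>

    // Illustrative only: forward to std::find over the whole range,
    // which is what llvm::find does.
    template <typename Range, typename T>
    auto findInRange(Range &&range, const T &value) {
      return std::find(std::begin(range), std::end(range), value);
    }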
diff --git a/clang/unittests/Format/FormatTestRawStrings.cpp b/clang/unittests/Format/FormatTestRawStrings.cpp
index 3f09c7b..5e8737d6 100644
--- a/clang/unittests/Format/FormatTestRawStrings.cpp
+++ b/clang/unittests/Format/FormatTestRawStrings.cpp
@@ -1002,9 +1002,11 @@ TEST_F(FormatTestRawStrings, Json) {
};
EXPECT_EQ("json = R\"json({\n"
+ " \"foo\": \"bar\",\n"
" \"str\": \"test\"\n"
" })json\";",
format("json = R\"json({\n"
+ " \"foo\": \"bar\",\n"
" \"str\": \"test\"\n"
"})json\";",
Style));
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index d64e34d..873c6c4 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -4105,6 +4105,20 @@ TEST_F(TokenAnnotatorTest, BitFieldColon) {
EXPECT_TOKEN(Tokens[5], tok::colon, TT_BitFieldColon);
}
+TEST_F(TokenAnnotatorTest, JsonCodeInRawString) {
+ auto Tokens = annotate("{\n"
+ " \"foo\": \"bar\",\n"
+ " \"str\": \"test\"\n"
+ "}",
+ getLLVMStyle(FormatStyle::LK_Json));
+ ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+ EXPECT_TOKEN(Tokens[0], tok::l_brace, TT_DictLiteral);
+ EXPECT_TOKEN(Tokens[1], tok::string_literal, TT_SelectorName);
+ EXPECT_TOKEN(Tokens[2], tok::colon, TT_DictLiteral);
+ EXPECT_TOKEN(Tokens[5], tok::string_literal, TT_SelectorName);
+ EXPECT_TOKEN(Tokens[6], tok::colon, TT_DictLiteral);
+}
+
} // namespace
} // namespace format
} // namespace clang
diff --git a/clang/utils/analyzer/requirements.txt b/clang/utils/analyzer/requirements.txt
index 8ae8bc8..ed09161 100644
--- a/clang/utils/analyzer/requirements.txt
+++ b/clang/utils/analyzer/requirements.txt
@@ -1,6 +1,6 @@
graphviz
humanize
matplotlib
-pandas
-psutil
+pandas>=1.0.4
+psutil>=5.6.6
seaborn
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index e47466e..dcff2fc 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -864,7 +864,7 @@ conformance.</p>
<tr>
<td>Underspecified object definitions</td>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3006.htm">N3006</a></td>
- <td class="none" align="center">No</td>
+ <td class="unreleased" align="center">Yes</td>
</tr>
<tr>
<td>Type inference for object declarations</td>
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index d06eb4a..ad82d7a 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -14811,7 +14811,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/2496.html">2496</a></td>
<td>CD6</td>
<td><I>ref-qualifier</I>s and virtual overriding</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="unreleased" align="center">Clang 21</td>
</tr>
<tr class="open" id="2497">
<td><a href="https://cplusplus.github.io/CWG/issues/2497.html">2497</a></td>
diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
index 4d0fda4..16d55fc 100644
--- a/compiler-rt/lib/builtins/cpu_model/riscv.c
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -24,12 +24,18 @@ struct {
// TODO: Maybe generate a header from tablegen then include it.
#define A_GROUPID 0
#define A_BITMASK (1ULL << 0)
+#define B_GROUPID 0
+#define B_BITMASK (1ULL << 1)
#define C_GROUPID 0
#define C_BITMASK (1ULL << 2)
#define D_GROUPID 0
#define D_BITMASK (1ULL << 3)
+#define E_GROUPID 0
+#define E_BITMASK (1ULL << 4)
#define F_GROUPID 0
#define F_BITMASK (1ULL << 5)
+#define H_GROUPID 0
+#define H_BITMASK (1ULL << 7)
#define I_GROUPID 0
#define I_BITMASK (1ULL << 8)
#define M_GROUPID 0
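
The new B/E/H entries follow the file's existing encoding: a single-letter extension occupies bit (letter - 'a') of group 0 of the feature bitmask, which is why B is bit 1, E bit 4, and H bit 7. A hypothetical helper (not in the source) that captures the rule:

    #include <cassert>

    // Single-letter RISC-V extensions map to bit (c - 'a') of group 0,
    // matching the A..M defines above.
    static unsigned long long extensionBitmask(char ext) {
      assert(ext >= 'a' && ext <= 'z');
      return 1ULL << (ext - 'a');
    }
    // extensionBitmask('b') == (1ULL << 1)  // B_BITMASK
    // extensionBitmask('h') == (1ULL << 7)  // H_BITMASK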
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
index 7c4d23a..a4bc3d6 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
@@ -3085,6 +3085,10 @@ void InitializeInterceptors() {
#if !SANITIZER_ANDROID
TSAN_INTERCEPT(dl_iterate_phdr);
#endif
+
+ // Symbolization indirectly calls dl_iterate_phdr
+ ready_to_symbolize = true;
+
TSAN_MAYBE_INTERCEPT_ON_EXIT;
TSAN_INTERCEPT(__cxa_atexit);
TSAN_INTERCEPT(_exit);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index c83efec..d8be212 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -679,6 +679,12 @@ void CheckUnwind() {
bool is_initialized;
+// Symbolization indirectly calls dl_iterate_phdr. If a CHECK() fails early on
+// (prior to the dl_iterate_phdr interceptor setup) and triggers an attempted
+// symbolization, the process will segfault.
+// dl_iterate_phdr is not intercepted on Android.
+bool ready_to_symbolize = SANITIZER_ANDROID;
+
void Initialize(ThreadState *thr) {
// Thread safe because done before all threads exist.
if (is_initialized)
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index 4dc5e63..0be5359 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -54,6 +54,8 @@
namespace __tsan {
+extern bool ready_to_symbolize;
+
#if !SANITIZER_GO
struct MapUnmapCallback;
# if defined(__mips64) || defined(__aarch64__) || defined(__loongarch__) || \
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp
index 51a98e2..0820bf1 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp
@@ -846,7 +846,16 @@ ALWAYS_INLINE USED void PrintCurrentStack(uptr pc, bool fast) {
ptrace->trace_buffer[i] = ptrace->trace_buffer[ptrace->size - i - 1];
ptrace->trace_buffer[ptrace->size - i - 1] = tmp;
}
- PrintStack(SymbolizeStack(*ptrace));
+
+ if (ready_to_symbolize) {
+ PrintStack(SymbolizeStack(*ptrace));
+ } else {
+ Printf(
+ "WARNING: PrintCurrentStack() has been called too early, before "
+ "symbolization is possible. Printing unsymbolized stack trace:\n");
+ for (unsigned int i = 0; i < ptrace->size; i++)
+ Printf(" #%u: 0x%zx\n", i, ptrace->trace[i]);
+ }
#endif
}
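
The guard protects the report path: until the interceptors are installed, symbolization would walk dl_iterate_phdr and crash, so the report falls back to printing raw PCs. A minimal sketch of the same guard-flag pattern, with stand-ins for the sanitizer internals:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    static bool ready_to_symbolize = false;  // flipped once interceptors exist

    // Stand-in for the report path: raw PCs until symbolization is safe.
    void printStack(const std::vector<unsigned long> &pcs) {
      if (ready_to_symbolize) {
        // symbolize and pretty-print (elided)
      } else {
        std::printf("WARNING: symbolization not ready; raw stack:\n");
        for (std::size_t i = 0; i < pcs.size(); ++i)
          std::printf("  #%zu: 0x%lx\n", i, pcs[i]);
      }
    }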
diff --git a/compiler-rt/test/cfi/CMakeLists.txt b/compiler-rt/test/cfi/CMakeLists.txt
index 98c328f..a02573d 100644
--- a/compiler-rt/test/cfi/CMakeLists.txt
+++ b/compiler-rt/test/cfi/CMakeLists.txt
@@ -47,7 +47,7 @@ foreach(arch ${CFI_TEST_ARCH})
else()
add_cfi_test_suites(False False)
add_cfi_test_suites(False True)
- if (COMPILER_RT_HAS_LLD AND NOT arch STREQUAL "i386")
+ if (COMPILER_RT_HAS_LLD)
add_cfi_test_suites(True False)
add_cfi_test_suites(True True)
endif()
diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in
index 04d1a4d..8ca47a8 100644
--- a/compiler-rt/test/lit.common.configured.in
+++ b/compiler-rt/test/lit.common.configured.in
@@ -25,6 +25,7 @@ set_default("gold_executable", "@GOLD_EXECUTABLE@")
set_default("clang", "@COMPILER_RT_RESOLVED_TEST_COMPILER@")
set_default("compiler_id", "@COMPILER_RT_TEST_COMPILER_ID@")
set_default("python_executable", "@Python3_EXECUTABLE@")
+set_default("python_root_dir", "@Python3_ROOT_DIR@")
set_default("compiler_rt_debug", @COMPILER_RT_DEBUG_PYBOOL@)
set_default("compiler_rt_intercept_libdispatch", @COMPILER_RT_INTERCEPT_LIBDISPATCH_PYBOOL@)
set_default("compiler_rt_output_dir", "@COMPILER_RT_RESOLVED_OUTPUT_DIR@")
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index cec212f..b319e2c 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -30,6 +30,8 @@ class raw_ostream;
}
namespace Fortran::parser {
struct Expr;
+struct OpenMPDeclareReductionConstruct;
+struct OmpMetadirectiveDirective;
}
namespace Fortran::semantics {
@@ -728,6 +730,40 @@ private:
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GenericDetails &);
+// Used for OpenMP DECLARE REDUCTION: it holds the information needed
+// to resolve which declaration (there could be multiple with the same
+// name) to use for a given type.
+class UserReductionDetails {
+public:
+ using TypeVector = std::vector<const DeclTypeSpec *>;
+ using DeclInfo = std::variant<const parser::OpenMPDeclareReductionConstruct *,
+ const parser::OmpMetadirectiveDirective *>;
+ using DeclVector = std::vector<DeclInfo>;
+
+ UserReductionDetails() = default;
+
+ void AddType(const DeclTypeSpec &type) { typeList_.push_back(&type); }
+ const TypeVector &GetTypeList() const { return typeList_; }
+
+ bool SupportsType(const DeclTypeSpec &type) const {
+ // We have to compare the actual type, not the pointer, as some
+ // types are not guaranteed to be the same object.
+ for (auto t : typeList_) {
+ if (*t == type) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void AddDecl(const DeclInfo &decl) { declList_.emplace_back(decl); }
+ const DeclVector &GetDeclList() const { return declList_; }
+
+private:
+ TypeVector typeList_;
+ DeclVector declList_;
+};
+
class UnknownDetails {};
using Details = std::variant<UnknownDetails, MainProgramDetails, ModuleDetails,
@@ -735,7 +771,7 @@ using Details = std::variant<UnknownDetails, MainProgramDetails, ModuleDetails,
ObjectEntityDetails, ProcEntityDetails, AssocEntityDetails,
DerivedTypeDetails, UseDetails, UseErrorDetails, HostAssocDetails,
GenericDetails, ProcBindingDetails, NamelistDetails, CommonBlockDetails,
- TypeParamDetails, MiscDetails>;
+ TypeParamDetails, MiscDetails, UserReductionDetails>;
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Details &);
std::string DetailsToString(const Details &);
@@ -756,6 +792,7 @@ public:
LocalityShared, // named in SHARED locality-spec
InDataStmt, // initialized in a DATA statement, =>object, or /init/
InNamelist, // in a Namelist group
+ InCommonBlock, // referenced in a common block
EntryDummyArgument,
CompilerCreated, // A compiler created symbol
// For compiler created symbols that are constant but cannot legally have
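
As the comment in SupportsType notes, type equality must be tested by value; a pointer comparison would reject a semantically identical DeclTypeSpec held in a different object. A simplified illustration with a stand-in type:

    #include <string>
    #include <vector>

    // Stand-in for DeclTypeSpec; only the name participates in equality.
    struct TypeSpec { std::string name; };
    bool sameType(const TypeSpec &a, const TypeSpec &b) {
      return a.name == b.name;
    }

    bool supportsType(const std::vector<const TypeSpec *> &list,
                      const TypeSpec &type) {
      for (const TypeSpec *t : list)
        if (sameType(*t, type))  // pointer equality would miss equal types
          return true;
      return false;
    }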
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 0784a67..e0abe95 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -3368,4 +3368,12 @@ template void Unparse<Program>(llvm::raw_ostream &, const Program &,
template void Unparse<Expr>(llvm::raw_ostream &, const Expr &,
const common::LangOptions &, Encoding, bool, bool, preStatementType *,
AnalyzedObjectsAsFortran *);
+
+template void Unparse<parser::OpenMPDeclareReductionConstruct>(
+ llvm::raw_ostream &, const parser::OpenMPDeclareReductionConstruct &,
+ const common::LangOptions &, Encoding, bool, bool, preStatementType *,
+ AnalyzedObjectsAsFortran *);
+template void Unparse<parser::OmpMetadirectiveDirective>(llvm::raw_ostream &,
+ const parser::OmpMetadirectiveDirective &, const common::LangOptions &,
+ Encoding, bool, bool, preStatementType *, AnalyzedObjectsAsFortran *);
} // namespace Fortran::parser
diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp
index 6e55d02..43e23a9 100644
--- a/flang/lib/Semantics/assignment.cpp
+++ b/flang/lib/Semantics/assignment.cpp
@@ -43,6 +43,7 @@ public:
void Analyze(const parser::PointerAssignmentStmt &);
void Analyze(const parser::ConcurrentControl &);
int deviceConstructDepth_{0};
+ SemanticsContext &context() { return context_; }
private:
bool CheckForPureContext(const SomeExpr &rhs, parser::CharBlock rhsSource);
@@ -218,8 +219,17 @@ void AssignmentContext::PopWhereContext() {
AssignmentChecker::~AssignmentChecker() {}
+SemanticsContext &AssignmentChecker::context() {
+ return context_.value().context();
+}
+
AssignmentChecker::AssignmentChecker(SemanticsContext &context)
: context_{new AssignmentContext{context}} {}
+
+void AssignmentChecker::Enter(
+ const parser::OpenMPDeclareReductionConstruct &x) {
+ context().set_location(x.source);
+}
void AssignmentChecker::Enter(const parser::AssignmentStmt &x) {
context_.value().Analyze(x);
}
diff --git a/flang/lib/Semantics/assignment.h b/flang/lib/Semantics/assignment.h
index a67bee4..4a1bb92 100644
--- a/flang/lib/Semantics/assignment.h
+++ b/flang/lib/Semantics/assignment.h
@@ -37,6 +37,7 @@ class AssignmentChecker : public virtual BaseChecker {
public:
explicit AssignmentChecker(SemanticsContext &);
~AssignmentChecker();
+ void Enter(const parser::OpenMPDeclareReductionConstruct &x);
void Enter(const parser::AssignmentStmt &);
void Enter(const parser::PointerAssignmentStmt &);
void Enter(const parser::WhereStmt &);
@@ -54,6 +55,8 @@ public:
void Enter(const parser::OpenACCLoopConstruct &);
void Leave(const parser::OpenACCLoopConstruct &);
+ SemanticsContext &context();
+
private:
common::Indirection<AssignmentContext> context_;
};
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index f9d645d..bdd078c 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -8,6 +8,7 @@
#include "check-omp-structure.h"
#include "definable.h"
+#include "resolve-names-utils.h"
#include "flang/Evaluate/check-expression.h"
#include "flang/Evaluate/expression.h"
#include "flang/Evaluate/type.h"
@@ -3520,6 +3521,17 @@ bool OmpStructureChecker::CheckReductionOperator(
break;
}
}
+  // User-defined operators are OK if a reduction has been declared for
+  // them. We mangle those names to store the user details.
+ if (const auto *definedOp{std::get_if<parser::DefinedOpName>(&dOpr.u)}) {
+ std::string mangled{MangleDefinedOperator(definedOp->v.symbol->name())};
+ const Scope &scope{definedOp->v.symbol->owner()};
+ if (const Symbol *symbol{scope.FindSymbol(mangled)}) {
+ if (symbol->detailsIf<UserReductionDetails>()) {
+ return true;
+ }
+ }
+ }
context_.Say(source, "Invalid reduction operator in %s clause."_err_en_US,
parser::ToUpperCaseLetters(getClauseName(clauseId).str()));
return false;
@@ -3533,8 +3545,7 @@ bool OmpStructureChecker::CheckReductionOperator(
valid =
llvm::is_contained({"max", "min", "iand", "ior", "ieor"}, realName);
if (!valid) {
- auto *misc{name->symbol->detailsIf<MiscDetails>()};
- valid = misc && misc->kind() == MiscDetails::Kind::ConstructName;
+ valid = name->symbol->detailsIf<UserReductionDetails>();
}
}
if (!valid) {
@@ -3614,8 +3625,20 @@ void OmpStructureChecker::CheckReductionObjects(
}
}
+static bool CheckSymbolSupportsType(const Scope &scope,
+ const parser::CharBlock &name, const DeclTypeSpec &type) {
+ if (const auto *symbol{scope.FindSymbol(name)}) {
+ if (const auto *reductionDetails{
+ symbol->detailsIf<UserReductionDetails>()}) {
+ return reductionDetails->SupportsType(type);
+ }
+ }
+ return false;
+}
+
static bool IsReductionAllowedForType(
- const parser::OmpReductionIdentifier &ident, const DeclTypeSpec &type) {
+ const parser::OmpReductionIdentifier &ident, const DeclTypeSpec &type,
+ const Scope &scope, SemanticsContext &context) {
auto isLogical{[](const DeclTypeSpec &type) -> bool {
return type.category() == DeclTypeSpec::Logical;
}};
@@ -3635,27 +3658,40 @@ static bool IsReductionAllowedForType(
case parser::DefinedOperator::IntrinsicOperator::Multiply:
case parser::DefinedOperator::IntrinsicOperator::Add:
case parser::DefinedOperator::IntrinsicOperator::Subtract:
- return type.IsNumeric(TypeCategory::Integer) ||
+ if (type.IsNumeric(TypeCategory::Integer) ||
type.IsNumeric(TypeCategory::Real) ||
- type.IsNumeric(TypeCategory::Complex);
+ type.IsNumeric(TypeCategory::Complex))
+ return true;
+ break;
case parser::DefinedOperator::IntrinsicOperator::AND:
case parser::DefinedOperator::IntrinsicOperator::OR:
case parser::DefinedOperator::IntrinsicOperator::EQV:
case parser::DefinedOperator::IntrinsicOperator::NEQV:
- return isLogical(type);
+ if (isLogical(type)) {
+ return true;
+ }
+ break;
// Reduction identifier is not in OMP5.2 Table 5.2
default:
DIE("This should have been caught in CheckIntrinsicOperator");
return false;
}
- }
- return true;
+ parser::CharBlock name{MakeNameFromOperator(*intrinsicOp, context)};
+ return CheckSymbolSupportsType(scope, name, type);
+ } else if (const auto *definedOp{
+ std::get_if<parser::DefinedOpName>(&dOpr.u)}) {
+ return CheckSymbolSupportsType(
+ scope, MangleDefinedOperator(definedOp->v.symbol->name()), type);
+ }
+ llvm_unreachable(
+ "A DefinedOperator is either a DefinedOpName or an IntrinsicOperator");
}};
auto checkDesignator{[&](const parser::ProcedureDesignator &procD) {
const parser::Name *name{std::get_if<parser::Name>(&procD.u)};
+ CHECK(name && name->symbol);
if (name && name->symbol) {
const SourceName &realName{name->symbol->GetUltimate().name()};
// OMP5.2: The type [...] of a list item that appears in a
@@ -3664,18 +3700,35 @@ static bool IsReductionAllowedForType(
// IAND: arguments must be integers: F2023 16.9.100
// IEOR: arguments must be integers: F2023 16.9.106
// IOR: arguments must be integers: F2023 16.9.111
- return type.IsNumeric(TypeCategory::Integer);
+ if (type.IsNumeric(TypeCategory::Integer)) {
+ return true;
+ }
} else if (realName == "max" || realName == "min") {
// MAX: arguments must be integer, real, or character:
// F2023 16.9.135
// MIN: arguments must be integer, real, or character:
// F2023 16.9.141
- return type.IsNumeric(TypeCategory::Integer) ||
- type.IsNumeric(TypeCategory::Real) || isCharacter(type);
+ if (type.IsNumeric(TypeCategory::Integer) ||
+ type.IsNumeric(TypeCategory::Real) || isCharacter(type)) {
+ return true;
+ }
}
+
+      // If we get here, it may be a user-declared reduction, so check
+      // if the symbol has UserReductionDetails and, if so, whether it
+      // supports the type.
+ if (const auto *reductionDetails{
+ name->symbol->detailsIf<UserReductionDetails>()}) {
+ return reductionDetails->SupportsType(type);
+ }
+
+      // We also need to check for mangled names (max, min, iand, ieor and
+      // ior) and then check whether the type is supported.
+ parser::CharBlock mangledName{MangleSpecialFunctions(name->source)};
+ return CheckSymbolSupportsType(scope, mangledName, type);
}
- // TODO: user defined reduction operators. Just allow everything for now.
- return true;
+ // Everything else is "not matching type".
+ return false;
}};
return common::visit(
@@ -3690,7 +3743,8 @@ void OmpStructureChecker::CheckReductionObjectTypes(
for (auto &[symbol, source] : symbols) {
if (auto *type{symbol->GetType()}) {
- if (!IsReductionAllowedForType(ident, *type)) {
+ const auto &scope{context_.FindScope(symbol->name())};
+ if (!IsReductionAllowedForType(ident, *type, scope, context_)) {
context_.Say(source,
"The type of '%s' is incompatible with the reduction operator."_err_en_US,
symbol->name());
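
The net effect of the rewrite is a two-step check: the intrinsic allow-lists run first, and only if they do not admit the type is the (mangled) name looked up in the scope for a UserReductionDetails that supports it. A simplified sketch of that ordering, with stand-in types:

    #include <map>
    #include <set>
    #include <string>

    using TypeId = std::string;
    using Registry = std::map<std::string, std::set<TypeId>>; // name -> types

    bool isReductionAllowed(const std::string &op, const TypeId &type,
                            const Registry &userReductions) {
      static const std::set<TypeId> numeric{"integer", "real", "complex"};
      if ((op == "+" || op == "*" || op == "-") && numeric.count(type))
        return true;                        // intrinsic rule admits the type
      auto it = userReductions.find("op." + op); // mangled lookup, cf. op.+
      return it != userReductions.end() && it->second.count(type);
    }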
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index a1ec956..a726418 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -894,6 +894,7 @@ void ModFileWriter::PutEntity(llvm::raw_ostream &os, const Symbol &symbol) {
[&](const ObjectEntityDetails &) { PutObjectEntity(os, symbol); },
[&](const ProcEntityDetails &) { PutProcEntity(os, symbol); },
[&](const TypeParamDetails &) { PutTypeParam(os, symbol); },
+ [&](const UserReductionDetails &) { PutUserReduction(os, symbol); },
[&](const auto &) {
common::die("PutEntity: unexpected details: %s",
DetailsToString(symbol.details()).c_str());
@@ -1043,6 +1044,28 @@ void ModFileWriter::PutTypeParam(llvm::raw_ostream &os, const Symbol &symbol) {
os << '\n';
}
+void ModFileWriter::PutUserReduction(
+ llvm::raw_ostream &os, const Symbol &symbol) {
+ const auto &details{symbol.get<UserReductionDetails>()};
+  // The module content for an OpenMP DECLARE REDUCTION is the OpenMP
+  // declaration itself. There may be multiple declarations.
+ // Decls are pointers, so do not use a reference.
+ for (const auto decl : details.GetDeclList()) {
+ common::visit( //
+ common::visitors{//
+ [&](const parser::OpenMPDeclareReductionConstruct *d) {
+ Unparse(os, *d, context_.langOptions());
+ },
+ [&](const parser::OmpMetadirectiveDirective *m) {
+ Unparse(os, *m, context_.langOptions());
+ },
+ [&](const auto &) {
+ DIE("Unknown OpenMP DECLARE REDUCTION content");
+ }},
+ decl);
+ }
+}
+
void PutInit(llvm::raw_ostream &os, const Symbol &symbol, const MaybeExpr &init,
const parser::Expr *unanalyzed, SemanticsContext &context) {
if (IsNamedConstant(symbol) || symbol.owner().IsDerivedType()) {
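
PutUserReduction round-trips a DECLARE REDUCTION into the module file by unparsing each stored parse tree; the explicit Unparse instantiations added in unparse.cpp above exist for exactly these two alternatives. A simplified sketch of the variant dispatch, with stand-in declaration types:

    #include <iostream>
    #include <variant>
    #include <vector>

    struct DeclareReduction { const char *text; };
    struct Metadirective { const char *text; };
    using DeclInfo =
        std::variant<const DeclareReduction *, const Metadirective *>;

    void putDecls(std::ostream &os, const std::vector<DeclInfo> &decls) {
      // Each alternative would call its own Unparse<> instantiation; the
      // stand-ins just stream stored text.
      for (const DeclInfo &decl : decls)
        std::visit([&os](const auto *d) { os << d->text << '\n'; }, decl);
    }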
diff --git a/flang/lib/Semantics/mod-file.h b/flang/lib/Semantics/mod-file.h
index 82538fb..9e57240 100644
--- a/flang/lib/Semantics/mod-file.h
+++ b/flang/lib/Semantics/mod-file.h
@@ -80,6 +80,7 @@ private:
void PutDerivedType(const Symbol &, const Scope * = nullptr);
void PutDECStructure(const Symbol &, const Scope * = nullptr);
void PutTypeParam(llvm::raw_ostream &, const Symbol &);
+ void PutUserReduction(llvm::raw_ostream &, const Symbol &);
void PutSubprogram(const Symbol &);
void PutGeneric(const Symbol &);
void PutUse(const Symbol &);
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index ec385b7..65823ad 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2217,6 +2217,20 @@ static bool IsPrivatizable(const Symbol *sym) {
misc->kind() != MiscDetails::Kind::ConstructName));
}
+static bool IsSymbolStaticStorageDuration(const Symbol &symbol) {
+ LLVM_DEBUG(llvm::dbgs() << "IsSymbolStaticStorageDuration(" << symbol.name()
+ << "):\n");
+ auto ultSym = symbol.GetUltimate();
+ // Module-scope variable
+ return (ultSym.owner().kind() == Scope::Kind::Module) ||
+ // Data statement variable
+ (ultSym.flags().test(Symbol::Flag::InDataStmt)) ||
+ // Save attribute variable
+ (ultSym.attrs().test(Attr::SAVE)) ||
+ // Referenced in a common block
+ (ultSym.flags().test(Symbol::Flag::InCommonBlock));
+}
+
void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
if (!IsPrivatizable(symbol)) {
return;
@@ -2310,6 +2324,7 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
bool teamsDir = llvm::omp::allTeamsSet.test(dirContext.directive);
+ bool isStaticStorageDuration = IsSymbolStaticStorageDuration(*symbol);
if (dsa.any()) {
if (parallelDir || taskGenDir || teamsDir) {
@@ -2367,7 +2382,7 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
dsa = prevDSA;
} else if (taskGenDir) {
// TODO 5) dummy arg in orphaned taskgen construct -> firstprivate
- if (prevDSA.test(Symbol::Flag::OmpShared)) {
+ if (prevDSA.test(Symbol::Flag::OmpShared) || isStaticStorageDuration) {
// 6) shared in enclosing context -> shared
dsa = {Symbol::Flag::OmpShared};
makeSymbol(dsa);
@@ -2886,20 +2901,6 @@ void ResolveOmpTopLevelParts(
});
}
-static bool IsSymbolInCommonBlock(const Symbol &symbol) {
- // TODO Improve the performance of this predicate function.
- // Going through all symbols sequentially, in all common blocks, can be
- // slow when there are many symbols. A possible optimization is to add
- // an OmpInCommonBlock flag to Symbol, to make it possible to quickly
- // test if a given symbol is in a common block.
- for (const auto &cb : symbol.owner().commonBlocks()) {
- if (IsCommonBlockContaining(cb.second.get(), symbol)) {
- return true;
- }
- }
- return false;
-}
-
static bool IsSymbolThreadprivate(const Symbol &symbol) {
if (const auto *details{symbol.detailsIf<HostAssocDetails>()}) {
return details->symbol().test(Symbol::Flag::OmpThreadprivate);
@@ -2928,7 +2929,7 @@ static bool IsSymbolPrivate(const Symbol &symbol) {
case Scope::Kind::BlockConstruct:
return !symbol.attrs().test(Attr::SAVE) &&
!symbol.attrs().test(Attr::PARAMETER) && !IsAssumedShape(symbol) &&
- !IsSymbolInCommonBlock(symbol);
+ !symbol.flags().test(Symbol::Flag::InCommonBlock);
default:
return false;
}
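
With the InCommonBlock flag set at declaration time, the old quadratic scan over common blocks collapses into a flag test, and the new predicate is a plain disjunction of cheap symbol queries. A simplified restatement over stand-in flags:

    // Stand-in flags; the real code queries Symbol/Scope.
    struct Sym {
      bool inModuleScope, inDataStmt, hasSaveAttr, inCommonBlock;
    };

    bool hasStaticStorageDuration(const Sym &s) {
      return s.inModuleScope || s.inDataStmt || s.hasSaveAttr ||
             s.inCommonBlock;
    }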
diff --git a/flang/lib/Semantics/resolve-names-utils.h b/flang/lib/Semantics/resolve-names-utils.h
index 6478472..ee8113a 100644
--- a/flang/lib/Semantics/resolve-names-utils.h
+++ b/flang/lib/Semantics/resolve-names-utils.h
@@ -146,5 +146,11 @@ struct SymbolAndTypeMappings;
void MapSubprogramToNewSymbols(const Symbol &oldSymbol, Symbol &newSymbol,
Scope &newScope, SymbolAndTypeMappings * = nullptr);
+parser::CharBlock MakeNameFromOperator(
+ const parser::DefinedOperator::IntrinsicOperator &op,
+ SemanticsContext &context);
+parser::CharBlock MangleSpecialFunctions(const parser::CharBlock &name);
+std::string MangleDefinedOperator(const parser::CharBlock &name);
+
} // namespace Fortran::semantics
#endif // FORTRAN_SEMANTICS_RESOLVE_NAMES_H_
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 297007b..3e133b1 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -38,6 +38,7 @@
#include "flang/Semantics/type.h"
#include "flang/Support/Fortran.h"
#include "flang/Support/default-kinds.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
#include <list>
#include <map>
@@ -1467,11 +1468,15 @@ public:
static bool NeedsScope(const parser::OpenMPBlockConstruct &);
static bool NeedsScope(const parser::OmpClause &);
- bool Pre(const parser::OmpMetadirectiveDirective &) {
+ bool Pre(const parser::OmpMetadirectiveDirective &x) { //
+ metaDirective_ = &x;
++metaLevel_;
return true;
}
- void Post(const parser::OmpMetadirectiveDirective &) { --metaLevel_; }
+ void Post(const parser::OmpMetadirectiveDirective &) { //
+ metaDirective_ = nullptr;
+ --metaLevel_;
+ }
bool Pre(const parser::OpenMPRequiresConstruct &x) {
AddOmpSourceRange(x.source);
@@ -1522,7 +1527,7 @@ public:
auto *symbol{FindSymbol(NonDerivedTypeScope(), name)};
if (!symbol) {
context().Say(name.source,
- "Implicit subroutine declaration '%s' in !$OMP DECLARE REDUCTION"_err_en_US,
+ "Implicit subroutine declaration '%s' in DECLARE REDUCTION"_err_en_US,
name.source);
}
return true;
@@ -1551,7 +1556,7 @@ public:
AddOmpSourceRange(x.source);
ProcessReductionSpecifier(
std::get<Indirection<parser::OmpReductionSpecifier>>(x.t).value(),
- std::get<std::optional<parser::OmpClauseList>>(x.t));
+ std::get<std::optional<parser::OmpClauseList>>(x.t), x);
return false;
}
bool Pre(const parser::OmpMapClause &);
@@ -1712,9 +1717,13 @@ public:
private:
void ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec,
const parser::OmpClauseList &clauses);
+ template <typename T>
void ProcessReductionSpecifier(const parser::OmpReductionSpecifier &spec,
- const std::optional<parser::OmpClauseList> &clauses);
+ const std::optional<parser::OmpClauseList> &clauses,
+ const T &wholeConstruct);
+
int metaLevel_{0};
+ const parser::OmpMetadirectiveDirective *metaDirective_{nullptr};
};
bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) {
@@ -1801,14 +1810,92 @@ void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec,
PopScope();
}
+parser::CharBlock MakeNameFromOperator(
+ const parser::DefinedOperator::IntrinsicOperator &op,
+ SemanticsContext &context) {
+ switch (op) {
+ case parser::DefinedOperator::IntrinsicOperator::Multiply:
+ return parser::CharBlock{"op.*", 4};
+ case parser::DefinedOperator::IntrinsicOperator::Add:
+ return parser::CharBlock{"op.+", 4};
+ case parser::DefinedOperator::IntrinsicOperator::Subtract:
+ return parser::CharBlock{"op.-", 4};
+
+ case parser::DefinedOperator::IntrinsicOperator::AND:
+ return parser::CharBlock{"op.AND", 6};
+ case parser::DefinedOperator::IntrinsicOperator::OR:
+    return parser::CharBlock{"op.OR", 5};
+  case parser::DefinedOperator::IntrinsicOperator::EQV:
+    return parser::CharBlock{"op.EQV", 6};
+  case parser::DefinedOperator::IntrinsicOperator::NEQV:
+    return parser::CharBlock{"op.NEQV", 7};
+
+ default:
+ context.Say("Unsupported operator in DECLARE REDUCTION"_err_en_US);
+ return parser::CharBlock{"op.?", 4};
+ }
+}
+
+parser::CharBlock MangleSpecialFunctions(const parser::CharBlock &name) {
+ return llvm::StringSwitch<parser::CharBlock>(name.ToString())
+ .Case("max", {"op.max", 6})
+ .Case("min", {"op.min", 6})
+ .Case("iand", {"op.iand", 7})
+ .Case("ior", {"op.ior", 6})
+ .Case("ieor", {"op.ieor", 7})
+ .Default(name);
+}
+
+std::string MangleDefinedOperator(const parser::CharBlock &name) {
+ CHECK(name[0] == '.' && name[name.size() - 1] == '.');
+ return "op" + name.ToString();
+}
+
+template <typename T>
void OmpVisitor::ProcessReductionSpecifier(
const parser::OmpReductionSpecifier &spec,
- const std::optional<parser::OmpClauseList> &clauses) {
+ const std::optional<parser::OmpClauseList> &clauses,
+ const T &wholeOmpConstruct) {
+ const parser::Name *name{nullptr};
+ parser::CharBlock mangledName;
+ UserReductionDetails reductionDetailsTemp;
const auto &id{std::get<parser::OmpReductionIdentifier>(spec.t)};
- if (auto procDes{std::get_if<parser::ProcedureDesignator>(&id.u)}) {
- if (auto *name{std::get_if<parser::Name>(&procDes->u)}) {
- name->symbol =
- &MakeSymbol(*name, MiscDetails{MiscDetails::Kind::ConstructName});
+ if (auto *procDes{std::get_if<parser::ProcedureDesignator>(&id.u)}) {
+ name = std::get_if<parser::Name>(&procDes->u);
+ // This shouldn't be a procedure component: this is the name of the
+ // reduction being declared.
+ CHECK(name);
+ // Prevent the symbol from conflicting with the builtin function name
+ mangledName = MangleSpecialFunctions(name->source);
+ // Note: the Name inside the parse tree is not updated because it is const.
+ // All lookups must use MangleSpecialFunctions.
+ } else {
+ const auto &defOp{std::get<parser::DefinedOperator>(id.u)};
+ if (const auto *definedOp{std::get_if<parser::DefinedOpName>(&defOp.u)}) {
+ name = &definedOp->v;
+ mangledName = context().SaveTempName(MangleDefinedOperator(name->source));
+ } else {
+ mangledName = MakeNameFromOperator(
+ std::get<parser::DefinedOperator::IntrinsicOperator>(defOp.u),
+ context());
+ }
+ }
+
+ // Use reductionDetailsTemp if we can't find the symbol (this is
+  // the first, or only, instance with this name). The details are then
+  // stored in the symbol when it is created.
+ UserReductionDetails *reductionDetails{&reductionDetailsTemp};
+ Symbol *symbol{currScope().FindSymbol(mangledName)};
+ if (symbol) {
+ // If we found a symbol, we append the type info to the
+ // existing reductionDetails.
+ reductionDetails = symbol->detailsIf<UserReductionDetails>();
+
+ if (!reductionDetails) {
+ context().Say(
+ "Duplicate definition of '%s' in DECLARE REDUCTION"_err_en_US,
+ mangledName);
+ return;
}
}
@@ -1838,19 +1925,31 @@ void OmpVisitor::ProcessReductionSpecifier(
    // We need to walk t.u because Walk(t) does its own BeginDeclTypeSpec.
Walk(t.u);
- const DeclTypeSpec *typeSpec{GetDeclTypeSpec()};
- assert(typeSpec && "We should have a type here");
+ // Only process types we can find. There will be an error later on when
+ // a type isn't found.
+ if (const DeclTypeSpec *typeSpec{GetDeclTypeSpec()}) {
+ reductionDetails->AddType(*typeSpec);
- for (auto &nm : ompVarNames) {
- ObjectEntityDetails details{};
- details.set_type(*typeSpec);
- MakeSymbol(nm, Attrs{}, std::move(details));
+ for (auto &nm : ompVarNames) {
+ ObjectEntityDetails details{};
+ details.set_type(*typeSpec);
+ MakeSymbol(nm, Attrs{}, std::move(details));
+ }
}
EndDeclTypeSpec();
Walk(std::get<std::optional<parser::OmpReductionCombiner>>(spec.t));
Walk(clauses);
PopScope();
}
+
+ reductionDetails->AddDecl(&wholeOmpConstruct);
+
+ if (!symbol) {
+ symbol = &MakeSymbol(mangledName, Attrs{}, std::move(*reductionDetails));
+ }
+ if (name) {
+ name->symbol = symbol;
+ }
}
bool OmpVisitor::Pre(const parser::OmpDirectiveSpecification &x) {
@@ -1880,7 +1979,8 @@ bool OmpVisitor::Pre(const parser::OmpDirectiveSpecification &x) {
if (maybeArgs && maybeClauses) {
const parser::OmpArgument &first{maybeArgs->v.front()};
if (auto *spec{std::get_if<parser::OmpReductionSpecifier>(&first.u)}) {
- ProcessReductionSpecifier(*spec, maybeClauses);
+ CHECK(metaDirective_);
+ ProcessReductionSpecifier(*spec, maybeClauses, *metaDirective_);
}
}
break;
@@ -6820,6 +6920,9 @@ bool DeclarationVisitor::Pre(const parser::CommonBlockObject &) {
void DeclarationVisitor::Post(const parser::CommonBlockObject &x) {
const auto &name{std::get<parser::Name>(x.t)};
+ if (auto *symbol{FindSymbol(name)}) {
+ symbol->set(Symbol::Flag::InCommonBlock);
+ }
DeclareObjectEntity(name);
auto pair{specPartState_.commonBlockObjects.insert(name.source)};
if (!pair.second) {
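
The mangling scheme keeps reduction names out of the user namespace: intrinsic operators become op.+ and friends, the five special functions become op.max style, and defined operators keep their dots (.fluffy. becomes op.fluffy.). A standalone sketch of the convention over std::string (the real code returns parser::CharBlock):

    #include <cassert>
    #include <string>

    std::string mangleDefinedOperator(const std::string &name) {
      assert(name.front() == '.' && name.back() == '.');
      return "op" + name;        // ".fluffy." -> "op.fluffy."
    }

    std::string mangleSpecialFunction(const std::string &name) {
      for (const char *f : {"max", "min", "iand", "ior", "ieor"})
        if (name == f)
          return "op." + name;   // "max" -> "op.max"
      return name;               // everything else is left alone
    }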
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index 52f7403..0380207 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -292,8 +292,7 @@ void GenericDetails::CopyFrom(const GenericDetails &from) {
// This is primarily for debugging.
std::string DetailsToString(const Details &details) {
return common::visit(
- common::visitors{
- [](const UnknownDetails &) { return "Unknown"; },
+ common::visitors{[](const UnknownDetails &) { return "Unknown"; },
[](const MainProgramDetails &) { return "MainProgram"; },
[](const ModuleDetails &) { return "Module"; },
[](const SubprogramDetails &) { return "Subprogram"; },
@@ -312,7 +311,7 @@ std::string DetailsToString(const Details &details) {
[](const TypeParamDetails &) { return "TypeParam"; },
[](const MiscDetails &) { return "Misc"; },
[](const AssocEntityDetails &) { return "AssocEntity"; },
- },
+ [](const UserReductionDetails &) { return "UserReductionDetails"; }},
details);
}
@@ -346,6 +345,9 @@ bool Symbol::CanReplaceDetails(const Details &details) const {
[&](const HostAssocDetails &) {
return this->has<HostAssocDetails>();
},
+ [&](const UserReductionDetails &) {
+ return this->has<UserReductionDetails>();
+ },
[](const auto &) { return false; },
},
details);
@@ -644,6 +646,11 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Details &details) {
[&](const MiscDetails &x) {
os << ' ' << MiscDetails::EnumToString(x.kind());
},
+ [&](const UserReductionDetails &x) {
+ for (auto &type : x.GetTypeList()) {
+ DumpType(os, type);
+ }
+ },
[&](const auto &x) { os << x; },
},
details);
diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90
index 468bb0c..b97be14 100644
--- a/flang/test/Integration/debug-local-var-2.f90
+++ b/flang/test/Integration/debug-local-var-2.f90
@@ -1,6 +1,5 @@
-! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -mllvm --experimental-debuginfo-iterators=true -o - | FileCheck %s --check-prefixes=BOTH,RECORDS
-! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -mllvm --experimental-debuginfo-iterators=false -o - | FileCheck --check-prefix=LINEONLY %s
-! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -mllvm --experimental-debuginfo-iterators=true -o - | FileCheck --check-prefix=LINEONLY %s
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s --check-prefixes=BOTH,RECORDS
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck --check-prefix=LINEONLY %s
! This tests checks the debug information for local variables in llvm IR.
diff --git a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 b/flang/test/Parser/OpenMP/declare-reduction-multi.f90
new file mode 100644
index 0000000..0e1adcc
--- /dev/null
+++ b/flang/test/Parser/OpenMP/declare-reduction-multi.f90
@@ -0,0 +1,134 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+!! Test multiple declarations for the same type, with different operations.
+module mymod
+ type :: tt
+ real r
+ end type tt
+contains
+ function mymax(a, b)
+ type(tt) :: a, b, mymax
+ if (a%r > b%r) then
+ mymax = a
+ else
+ mymax = b
+ end if
+ end function mymax
+end module mymod
+
+program omp_examples
+!CHECK-LABEL: PROGRAM omp_examples
+ use mymod
+ implicit none
+ integer, parameter :: n = 100
+ integer :: i
+ type(tt) :: values(n), sum, prod, big, small
+
+ !$omp declare reduction(+:tt:omp_out%r = omp_out%r + omp_in%r) initializer(omp_priv%r = 0)
+!CHECK: !$OMP DECLARE REDUCTION (+:tt: omp_out%r=omp_out%r+omp_in%r
+!CHECK-NEXT: ) INITIALIZER(omp_priv%r=0_4)
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpReductionSpecifier
+!PARSE-TREE-NEXT: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE: OmpTypeNameList -> OmpTypeSpecifier -> TypeSpec -> DerivedTypeSpec
+!PARSE-TREE-NEXT: Name = 'tt'
+!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out%r=omp_out%r+omp_in%r'
+!PARSE-TREE: OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4
+ !$omp declare reduction(*:tt:omp_out%r = omp_out%r * omp_in%r) initializer(omp_priv%r = 1)
+!CHECK-NEXT: !$OMP DECLARE REDUCTION (*:tt: omp_out%r=omp_out%r*omp_in%r
+!CHECK-NEXT: ) INITIALIZER(omp_priv%r=1_4)
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpReductionSpecifier
+!PARSE-TREE: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply
+!PARSE-TREE: OmpTypeNameList -> OmpTypeSpecifier -> TypeSpec -> DerivedTypeSpec
+!PARSE-TREE-NEXT: Name = 'tt'
+!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out%r=omp_out%r*omp_in%r'
+!PARSE-TREE: OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4'
+ !$omp declare reduction(max:tt:omp_out = mymax(omp_out, omp_in)) initializer(omp_priv%r = 0)
+!CHECK-NEXT: !$OMP DECLARE REDUCTION (max:tt: omp_out=mymax(omp_out,omp_in)
+!CHECK-NEXT: ) INITIALIZER(omp_priv%r=0_4)
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpReductionSpecifier
+!PARSE-TREE: OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max'
+!PARSE-TREE: OmpTypeNameList -> OmpTypeSpecifier -> TypeSpec -> DerivedTypeSpec
+!PARSE-TREE: Name = 'tt'
+!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out=mymax(omp_out,omp_in)'
+!PARSE-TREE: OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=0._4'
+ !$omp declare reduction(min:tt:omp_out%r = min(omp_out%r, omp_in%r)) initializer(omp_priv%r = 1)
+!CHECK-NEXT: !$OMP DECLARE REDUCTION (min:tt: omp_out%r=min(omp_out%r,omp_in%r)
+!CHECK-NEXT: ) INITIALIZER(omp_priv%r=1_4)
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpReductionSpecifier
+!PARSE-TREE: OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min'
+!PARSE-TREE: OmpTypeNameList -> OmpTypeSpecifier -> TypeSpec -> DerivedTypeSpec
+!PARSE-TREE: Name = 'tt'
+!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out%r=min(omp_out%r,omp_in%r)'
+!PARSE-TREE: OmpClauseList -> OmpClause -> Initializer -> OmpInitializerClause -> AssignmentStmt = 'omp_priv%r=1._4'
+ call random_number(values%r)
+
+ sum%r = 0
+ !$omp parallel do reduction(+:sum)
+!CHECK: !$OMP PARALLEL DO REDUCTION(+: sum)
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE: OmpBeginLoopDirective
+!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do
+!PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause
+!PARSE-TREE: Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'sum
+!PARSE-TREE: DoConstruct
+ do i = 1, n
+ sum%r = sum%r + values(i)%r
+ end do
+
+ prod%r = 1
+ !$omp parallel do reduction(*:prod)
+!CHECK: !$OMP PARALLEL DO REDUCTION(*: prod)
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE: OmpBeginLoopDirective
+!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do
+!PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause
+!PARSE-TREE: Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply
+!PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'prod'
+!PARSE-TREE: DoConstruct
+ do i = 1, n
+ prod%r = prod%r * (values(i)%r+0.6)
+ end do
+
+ big%r = 0
+ !$omp parallel do reduction(max:big)
+!CHECK: !$OMP PARALLEL DO REDUCTION(max: big)
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE: OmpBeginLoopDirective
+!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do
+!PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause
+!PARSE-TREE: Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max'
+!PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'big'
+!PARSE-TREE: DoConstruct
+ do i = 1, n
+ big = mymax(values(i), big)
+ end do
+
+ small%r = 1
+ !$omp parallel do reduction(min:small)
+!CHECK: !$OMP PARALLEL DO REDUCTION(min: small)
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE: OmpBeginLoopDirective
+!PARSE-TREE: OmpLoopDirective -> llvm::omp::Directive = parallel do
+!PARSE-TREE: OmpClauseList -> OmpClause -> Reduction -> OmpReductionClause
+!PARSE-TREE: Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min'
+!PARSE-TREE: OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'small'
+!PARSE-TREE: DoConstruct
+ do i = 1, n
+ small%r = min(values(i)%r, small%r)
+ end do
+
+ print *, values%r
+ print *, "sum=", sum%r
+ print *, "prod=", prod%r
+ print *, "small=", small%r, " big=", big%r
+end program omp_examples
diff --git a/flang/test/Parser/OpenMP/declare-reduction-operator.f90 b/flang/test/Parser/OpenMP/declare-reduction-operator.f90
new file mode 100644
index 0000000..7bfb781
--- /dev/null
+++ b/flang/test/Parser/OpenMP/declare-reduction-operator.f90
@@ -0,0 +1,59 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+!CHECK-LABEL: SUBROUTINE reduce_1 (n, tts)
+subroutine reduce_1 ( n, tts )
+ type :: tt
+ integer :: x
+ integer :: y
+ end type tt
+ type :: tt2
+ real(8) :: x
+ real(8) :: y
+ end type
+
+ integer :: n
+ type(tt) :: tts(n)
+ type(tt2) :: tts2(n)
+
+!CHECK: !$OMP DECLARE REDUCTION (+:tt: omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)
+!CHECK: ) INITIALIZER(omp_priv=tt(x=0_4,y=0_4))
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpReductionSpecifier
+!PARSE-TREE: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out=tt(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
+!PARSE-TREE: OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt(x=0_4,y=0_4)'
+
+ !$omp declare reduction(+ : tt : omp_out = tt(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt(0,0))
+
+
+!CHECK: !$OMP DECLARE REDUCTION (+:tt2: omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)
+!CHECK: ) INITIALIZER(omp_priv=tt2(x=0._8,y=0._8)
+!PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareReductionConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpReductionSpecifier
+!PARSE-TREE: OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add
+!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out=tt2(x=omp_out%x-omp_in%x,y=omp_out%y-omp_in%y)'
+!PARSE-TREE: OmpInitializerClause -> AssignmentStmt = 'omp_priv=tt2(x=0._8,y=0._8)'
+
+ !$omp declare reduction(+ :tt2 : omp_out = tt2(omp_out%x - omp_in%x , omp_out%y - omp_in%y)) initializer(omp_priv = tt2(0,0))
+
+ type(tt) :: diffp = tt( 0, 0 )
+ type(tt2) :: diffp2 = tt2( 0, 0 )
+ integer :: i
+
+ !$omp parallel do reduction(+ : diffp)
+ do i = 1, n
+ diffp%x = diffp%x + tts(i)%x
+ diffp%y = diffp%y + tts(i)%y
+ end do
+
+ !$omp parallel do reduction(+ : diffp2)
+ do i = 1, n
+ diffp2%x = diffp2%x + tts2(i)%x
+ diffp2%y = diffp2%y + tts2(i)%y
+ end do
+
+end subroutine reduce_1
+!CHECK: END SUBROUTINE reduce_1
diff --git a/flang/test/Semantics/OpenMP/common-block.f90 b/flang/test/Semantics/OpenMP/common-block.f90
index 93f29b1..adf77b0 100644
--- a/flang/test/Semantics/OpenMP/common-block.f90
+++ b/flang/test/Semantics/OpenMP/common-block.f90
@@ -1,9 +1,9 @@
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
program main
- !CHECK: a size=4 offset=0: ObjectEntity type: REAL(4)
- !CHECK: b size=8 offset=4: ObjectEntity type: INTEGER(4) shape: 1_8:2_8
- !CHECK: c size=4 offset=12: ObjectEntity type: REAL(4)
+ !CHECK: a (InCommonBlock) size=4 offset=0: ObjectEntity type: REAL(4)
+ !CHECK: b (InCommonBlock) size=8 offset=4: ObjectEntity type: INTEGER(4) shape: 1_8:2_8
+ !CHECK: c (InCommonBlock) size=4 offset=12: ObjectEntity type: REAL(4)
!CHECK: blk size=16 offset=0: CommonBlockDetails alignment=4: a b c
real :: a, c
integer :: b(2)
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-bad-operator.f90 b/flang/test/Semantics/OpenMP/declare-reduction-bad-operator.f90
new file mode 100644
index 0000000..1d1d290
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-bad-operator.f90
@@ -0,0 +1,6 @@
+! RUN: not %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s
+
+function func(n)
+ !$omp declare reduction(/:integer:omp_out=omp_out+omp_in)
+!CHECK: error: Unsupported operator in DECLARE REDUCTION
+end function func
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-bad-operator2.f90 b/flang/test/Semantics/OpenMP/declare-reduction-bad-operator2.f90
new file mode 100644
index 0000000..9ee223c
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-bad-operator2.f90
@@ -0,0 +1,28 @@
+! RUN: not %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s
+
+module m1
+ interface operator(.fluffy.)
+ procedure my_mul
+ end interface
+ type t1
+ integer :: val = 1
+ end type
+!$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)
+contains
+ function my_mul(x, y)
+ type (t1), intent (in) :: x, y
+ type (t1) :: my_mul
+ my_mul%val = x%val * y%val
+ end function my_mul
+
+ subroutine subr(a, r)
+ implicit none
+ integer, intent(in), dimension(10) :: a
+ integer, intent(out) :: r
+ integer :: i
+ !$omp do parallel reduction(.fluffy.:r)
+!CHECK: error: The type of 'r' is incompatible with the reduction operator.
+ do i=1,10
+ end do
+ end subroutine subr
+end module m1
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-dupsym.f90 b/flang/test/Semantics/OpenMP/declare-reduction-dupsym.f90
new file mode 100644
index 0000000..83f8f85
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-dupsym.f90
@@ -0,0 +1,15 @@
+! RUN: not %flang_fc1 -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s
+
+!! Check for duplicate symbol use.
+subroutine dup_symbol()
+ type :: loc
+ integer :: x
+ integer :: y
+ end type loc
+
+ integer :: my_red
+
+!CHECK: error: Duplicate definition of 'my_red' in DECLARE REDUCTION
+ !$omp declare reduction(my_red : loc : omp_out%x = omp_out%x + omp_in%x) initializer(omp_priv%x = 0)
+
+end subroutine dup_symbol
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-error.f90 b/flang/test/Semantics/OpenMP/declare-reduction-error.f90
index c22cf10..21f5cc1 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction-error.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction-error.f90
@@ -7,5 +7,5 @@ end subroutine initme
subroutine subr
!$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0))
- !CHECK: error: Implicit subroutine declaration 'initme' in !$OMP DECLARE REDUCTION
+ !CHECK: error: Implicit subroutine declaration 'initme' in DECLARE REDUCTION
end subroutine subr
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-functions.f90 b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90
new file mode 100644
index 0000000..000d323
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-functions.f90
@@ -0,0 +1,203 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+
+module mm
+ implicit none
+ type two
+ integer(4) :: a, b
+ end type two
+
+ type three
+ integer(8) :: a, b, c
+ end type three
+
+ type twothree
+ type(two) t2
+ type(three) t3
+ end type twothree
+
+contains
+!CHECK-LABEL: Subprogram scope: inittwo
+ subroutine inittwo(x,n)
+ integer :: n
+ type(two) :: x
+ x%a=n
+ x%b=n
+ end subroutine inittwo
+
+ subroutine initthree(x,n)
+ integer :: n
+ type(three) :: x
+ x%a=n
+ x%b=n
+ end subroutine initthree
+
+ function add_two(x, y)
+ type(two) add_two, x, y, res
+ res%a = x%a + y%a
+ res%b = x%b + y%b
+ add_two = res
+ end function add_two
+
+ function add_three(x, y)
+ type(three) add_three, x, y, res
+ res%a = x%a + y%a
+ res%b = x%b + y%b
+ res%c = x%c + y%c
+ add_three = res
+ end function add_three
+
+!CHECK-LABEL: Subprogram scope: functwo
+ function functwo(x, n)
+ type(two) functwo
+ integer :: n
+ type(two) :: x(n)
+ type(two) :: res
+ integer :: i
+ !$omp declare reduction(adder:two:omp_out=add_two(omp_out,omp_in)) initializer(inittwo(omp_priv,0))
+!CHECK: adder: UserReductionDetails TYPE(two)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+
+
+ !$omp simd reduction(adder:res)
+ do i=1,n
+ res=add_two(res,x(i))
+ enddo
+ functwo=res
+ end function functwo
+
+ function functhree(x, n)
+ implicit none
+ type(three) :: functhree
+ type(three) :: x(n)
+ type(three) :: res
+ integer :: i
+ integer :: n
+ !$omp declare reduction(adder:three:omp_out=add_three(omp_out,omp_in)) initializer(initthree(omp_priv,1))
+
+ !$omp simd reduction(adder:res)
+ do i=1,n
+ res=add_three(res,x(i))
+ enddo
+ functhree=res
+ end function functhree
+
+ function functwothree(x, n)
+ type(twothree) :: functwothree
+ type(twothree) :: x(n)
+ type(twothree) :: res
+ type(two) :: res2
+ type(three) :: res3
+ integer :: n
+ integer :: i
+
+ !$omp declare reduction(adder:two:omp_out=add_two(omp_out,omp_in)) initializer(inittwo(omp_priv,0))
+
+ !$omp declare reduction(adder:three:omp_out=add_three(omp_out,omp_in)) initializer(initthree(omp_priv,1))
+
+!CHECK: adder: UserReductionDetails TYPE(two) TYPE(three)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three)
+!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three)
+!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three)
+!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three)
+
+ !$omp simd reduction(adder:res3)
+ do i=1,n
+ res3=add_three(res%t3,x(i)%t3)
+ enddo
+
+ !$omp simd reduction(adder:res2)
+ do i=1,n
+ res2=add_two(res2,x(i)%t2)
+ enddo
+ res%t2 = res2
+ res%t3 = res3
+ functwothree=res
+ end function functwothree
+
+!CHECK-LABEL: Subprogram scope: funcbtwo
+ function funcBtwo(x, n)
+ type(two) funcBtwo
+ integer :: n
+ type(two) :: x(n)
+ type(two) :: res
+ integer :: i
+ !$omp declare reduction(+:two:omp_out=add_two(omp_out,omp_in)) initializer(inittwo(omp_priv,0))
+!CHECK: op.+: UserReductionDetails TYPE(two)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+
+
+ !$omp simd reduction(+:res)
+ do i=1,n
+ res=add_two(res,x(i))
+ enddo
+ funcBtwo=res
+ end function funcBtwo
+
+ function funcBtwothree(x, n)
+ type(twothree) :: funcBtwothree
+ type(twothree) :: x(n)
+ type(twothree) :: res
+ type(two) :: res2
+ type(three) :: res3
+ integer :: n
+ integer :: i
+
+ !$omp declare reduction(+:two:omp_out=add_two(omp_out,omp_in)) initializer(inittwo(omp_priv,0))
+
+ !$omp declare reduction(+:three:omp_out=add_three(omp_out,omp_in)) initializer(initthree(omp_priv,1))
+
+!CHECK: op.+: UserReductionDetails TYPE(two) TYPE(three)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=8 offset=0: ObjectEntity type: TYPE(two)
+!CHECK: omp_orig size=8 offset=8: ObjectEntity type: TYPE(two)
+!CHECK: omp_out size=8 offset=16: ObjectEntity type: TYPE(two)
+!CHECK: omp_priv size=8 offset=24: ObjectEntity type: TYPE(two)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=24 offset=0: ObjectEntity type: TYPE(three)
+!CHECK: omp_orig size=24 offset=24: ObjectEntity type: TYPE(three)
+!CHECK: omp_out size=24 offset=48: ObjectEntity type: TYPE(three)
+!CHECK: omp_priv size=24 offset=72: ObjectEntity type: TYPE(three)
+
+ !$omp simd reduction(+:res3)
+ do i=1,n
+ res3=add_three(res%t3,x(i)%t3)
+ enddo
+
+ !$omp simd reduction(+:res2)
+ do i=1,n
+ res2=add_two(res2,x(i)%t2)
+ enddo
+ res%t2 = res2
+ res%t3 = res3
+ end function funcBtwothree
+
+  !! This checks a special case where a reduction is declared inside a
+  !! pure function.
+
+ pure logical function reduction()
+!CHECK: reduction size=4 offset=0: ObjectEntity funcResult type: LOGICAL(4)
+!CHECK: rr: UserReductionDetails INTEGER(4)
+!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes
+!CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4)
+!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4)
+!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4)
+!CHECK: omp_priv size=4 offset=12: ObjectEntity type: INTEGER(4)
+ !$omp declare reduction (rr : integer : omp_out = omp_out + omp_in) initializer (omp_priv = 0)
+ reduction = .false.
+ end function reduction
+
+end module mm
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-logical.f90 b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90
new file mode 100644
index 0000000..7ab7cad
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-logical.f90
@@ -0,0 +1,32 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+
+module mm
+ implicit none
+ type logicalwrapper
+ logical b
+ end type logicalwrapper
+
+contains
+!CHECK-LABEL: Subprogram scope: func
+ function func(x, n)
+ logical func
+ integer :: n
+ type(logicalwrapper) :: x(n)
+ type(logicalwrapper) :: res
+ integer :: i
+ !$omp declare reduction(.AND.:type(logicalwrapper):omp_out%b=omp_out%b .AND. omp_in%b) initializer(omp_priv%b=.true.)
+!CHECK: op.AND: UserReductionDetails TYPE(logicalwrapper)
+!CHECK: OtherConstruct scope
+!CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(logicalwrapper)
+!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(logicalwrapper)
+!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(logicalwrapper)
+!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(logicalwrapper)
+
+ !$omp simd reduction(.AND.:res)
+ do i=1,n
+ res%b=res%b .and. x(i)%b
+ enddo
+
+ func=res%b
+ end function func
+end module mm
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90
new file mode 100644
index 0000000..9d0a097
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90
@@ -0,0 +1,51 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+
+!! Test the name mangling for min & max (also used for iand, ieor and ior).
+module mymod
+ type :: tt
+ real r
+ end type tt
+contains
+ function mymax(a, b)
+ type(tt) :: a, b, mymax
+ if (a%r > b%r) then
+ mymax = a
+ else
+ mymax = b
+ end if
+ end function mymax
+end module mymod
+
+program omp_examples
+!CHECK-LABEL: MainProgram scope: omp_examples
+ use mymod
+ implicit none
+ integer, parameter :: n = 100
+ integer :: i
+ type(tt) :: values(n), big, small
+
+ !$omp declare reduction(max:tt:omp_out = mymax(omp_out, omp_in)) initializer(omp_priv%r = 0)
+ !$omp declare reduction(min:tt:omp_out%r = min(omp_out%r, omp_in%r)) initializer(omp_priv%r = 1)
+
+!CHECK: min, ELEMENTAL, INTRINSIC, PURE (Function): ProcEntity
+!CHECK: mymax (Function): Use from mymax in mymod
+!CHECK: op.max: UserReductionDetails TYPE(tt)
+!CHECK: op.min: UserReductionDetails TYPE(tt)
+
+ big%r = 0
+ !$omp parallel do reduction(max:big)
+!CHECK: big (OmpReduction, OmpExplicit): HostAssoc
+!CHECK: max, INTRINSIC: ProcEntity
+ do i = 1, n
+ big = mymax(values(i), big)
+ end do
+
+ small%r = 1
+ !$omp parallel do reduction(min:small)
+!CHECK: small (OmpReduction, OmpExplicit): HostAssoc
+ do i = 1, n
+ small%r = min(values(i)%r, small%r)
+ end do
+
+ print *, "small=", small%r, " big=", big%r
+end program omp_examples
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90 b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90
new file mode 100644
index 0000000..f80eb10
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-modfile.f90
@@ -0,0 +1,65 @@
+! RUN: %python %S/../test_modfile.py %s %flang_fc1 -fopenmp -fopenmp-version=52
+! Check correct modfile generation for OpenMP DECLARE REDUCTION construct.
+
+!Expect: drm.mod
+!module drm
+!type::t1
+!integer(4)::val
+!endtype
+!!$OMP DECLARE REDUCTION (*:t1:omp_out = omp_out*omp_in) INITIALIZER(omp_priv=t&
+!!$OMP&1(1))
+!!$OMP METADIRECTIVE OTHERWISE(DECLARE REDUCTION(+:INTEGER))
+!!$OMP DECLARE REDUCTION (.fluffy.:t1:omp_out = omp_out.fluffy.omp_in) INITIALI&
+!!$OMP&ZER(omp_priv=t1(0))
+!!$OMP DECLARE REDUCTION (.mul.:t1:omp_out = omp_out.mul.omp_in) INITIALIZER(om&
+!!$OMP&p_priv=t1(1))
+!interface operator(.mul.)
+!procedure::mul
+!end interface
+!interface operator(.fluffy.)
+!procedure::add
+!end interface
+!interface operator(*)
+!procedure::mul
+!end interface
+!contains
+!function mul(v1,v2)
+!type(t1),intent(in)::v1
+!type(t1),intent(in)::v2
+!type(t1)::mul
+!end
+!function add(v1,v2)
+!type(t1),intent(in)::v1
+!type(t1),intent(in)::v2
+!type(t1)::add
+!end
+!end
+
+module drm
+ type t1
+ integer :: val
+ end type t1
+ interface operator(.mul.)
+ procedure mul
+ end interface
+ interface operator(.fluffy.)
+ procedure add
+ end interface
+ interface operator(*)
+ module procedure mul
+ end interface
+!$omp declare reduction(*:t1:omp_out=omp_out*omp_in) initializer(omp_priv=t1(1))
+!$omp declare reduction(.mul.:t1:omp_out=omp_out.mul.omp_in) initializer(omp_priv=t1(1))
+!$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in) initializer(omp_priv=t1(0))
+!$omp metadirective otherwise(declare reduction(+: integer))
+contains
+ type(t1) function mul(v1, v2)
+ type(t1), intent (in):: v1, v2
+ mul%val = v1%val * v2%val
+ end function
+ type(t1) function add(v1, v2)
+ type(t1), intent (in):: v1, v2
+ add%val = v1%val + v2%val
+ end function
+end module drm
+
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operator.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90
new file mode 100644
index 0000000..dc12332
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-operator.f90
@@ -0,0 +1,36 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+
+module m1
+ interface operator(.fluffy.)
+!CHECK: .fluffy., PUBLIC (Function): Generic DefinedOp procs: my_mul
+ procedure my_mul
+ end interface
+ type t1
+ integer :: val = 1
+ end type
+!$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)
+!CHECK: op.fluffy., PUBLIC: UserReductionDetails TYPE(t1)
+!CHECK: t1, PUBLIC: DerivedType components: val
+!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes
+!CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1)
+!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1)
+!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1)
+!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1)
+contains
+ function my_mul(x, y)
+ type (t1), intent (in) :: x, y
+ type (t1) :: my_mul
+ my_mul%val = x%val * y%val
+ end function my_mul
+
+ subroutine subr(a, r)
+ implicit none
+ type(t1), intent(in), dimension(10) :: a
+ type(t1), intent(out) :: r
+ integer :: i
+ !$omp parallel do reduction(.fluffy.:r)
+ do i=1,10
+ r = r .fluffy. a(i)
+ end do
+ end subroutine subr
+end module m1
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90
new file mode 100644
index 0000000..d7a9f2f
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90
@@ -0,0 +1,84 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+
+module vector_mod
+ implicit none
+ type :: Vector
+ real :: x, y, z
+ contains
+ procedure :: add_vectors
+ generic :: operator(+) => add_vectors
+ end type Vector
+contains
+ ! Function implementing vector addition
+ function add_vectors(a, b) result(res)
+ class(Vector), intent(in) :: a, b
+ type(Vector) :: res
+ res%x = a%x + b%x
+ res%y = a%y + b%y
+ res%z = a%z + b%z
+ end function add_vectors
+end module vector_mod
+
+!! Test user-defined operators. Two different varieties, using conventional and
+!! unconventional names.
+module m1
+ interface operator(.mul.)
+ procedure my_mul
+ end interface
+ interface operator(.fluffy.)
+ procedure my_add
+ end interface
+ type t1
+ integer :: val = 1
+ end type
+!$omp declare reduction(.mul.:t1:omp_out=omp_out.mul.omp_in)
+!$omp declare reduction(.fluffy.:t1:omp_out=omp_out.fluffy.omp_in)
+!CHECK: op.fluffy., PUBLIC: UserReductionDetails TYPE(t1)
+!CHECK: op.mul., PUBLIC: UserReductionDetails TYPE(t1)
+contains
+ function my_mul(x, y)
+ type (t1), intent (in) :: x, y
+ type (t1) :: my_mul
+ my_mul%val = x%val * y%val
+ end function
+ function my_add(x, y)
+ type (t1), intent (in) :: x, y
+ type (t1) :: my_add
+ my_add%val = x%val + y%val
+ end function
+end module m1
+
+program test_vector
+!CHECK-LABEL: MainProgram scope: test_vector
+ use vector_mod
+!CHECK: add_vectors (Function): Use from add_vectors in vector_mod
+ implicit none
+ integer :: i
+ type(Vector) :: v1(100), v2(100)
+
+ !$OMP declare reduction(+:vector:omp_out=omp_out+omp_in) initializer(omp_priv=Vector(0,0,0))
+!CHECK: op.+: UserReductionDetails TYPE(vector)
+!CHECK: v1 size=1200 offset=4: ObjectEntity type: TYPE(vector) shape: 1_8:100_8
+!CHECK: v2 size=1200 offset=1204: ObjectEntity type: TYPE(vector) shape: 1_8:100_8
+!CHECK: vector: Use from vector in vector_mod
+
+!CHECK: OtherConstruct scope:
+!CHECK: omp_in size=12 offset=0: ObjectEntity type: TYPE(vector)
+!CHECK: omp_orig size=12 offset=12: ObjectEntity type: TYPE(vector)
+!CHECK: omp_out size=12 offset=24: ObjectEntity type: TYPE(vector)
+!CHECK: omp_priv size=12 offset=36: ObjectEntity type: TYPE(vector)
+
+ v2 = Vector(0.0, 0.0, 0.0)
+ v1 = Vector(1.0, 2.0, 3.0)
+ !$OMP parallel do reduction(+:v2)
+!CHECK: OtherConstruct scope
+!CHECK: i (OmpPrivate, OmpPreDetermined): HostAssoc
+!CHECK: v1 (OmpShared): HostAssoc
+!CHECK: v2 (OmpReduction, OmpExplicit): HostAssoc
+
+ do i = 1, 100
+ v2(i) = v2(i) + v1(i) ! Invokes add_vectors
+ end do
+
+ print *, 'v2 components:', v2%x, v2%y, v2%z
+end program test_vector
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90
new file mode 100644
index 0000000..12e80cb
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90
@@ -0,0 +1,47 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s
+
+!! Test that we can "rename" an operator when using a module's operator.
+module module1
+!CHECK: Module scope: module1 size=0
+ implicit none
+ type :: t1
+ real :: value
+ end type t1
+ interface operator(.mul.)
+ module procedure my_mul
+ end interface operator(.mul.)
+!CHECK: .mul., PUBLIC (Function): Generic DefinedOp procs: my_mul
+!CHECK: my_mul, PUBLIC (Function): Subprogram result:TYPE(t1) r (TYPE(t1) x,TYPE(t1)
+!CHECK: t1, PUBLIC: DerivedType components: value
+contains
+ function my_mul(x, y) result(r)
+ type(t1), intent(in) :: x, y
+ type(t1) :: r
+ r%value = x%value * y%value
+ end function my_mul
+end module module1
+
+program test_omp_reduction
+!CHECK: MainProgram scope: test_omp_reduction
+ use module1, only: t1, operator(.modmul.) => operator(.mul.)
+
+!CHECK: .modmul. (Function): Use from .mul. in module1
+ implicit none
+
+ type(t1) :: result
+ integer :: i
+ !$omp declare reduction (.modmul. : t1 : omp_out = omp_out .modmul. omp_in) initializer(omp_priv = t1(1.0))
+!CHECK: op.modmul.: UserReductionDetails TYPE(t1)
+!CHECK: t1: Use from t1 in module1
+!CHECK: OtherConstruct scope: size=16 alignment=4 sourceRange=0 bytes
+!CHECK: omp_in size=4 offset=0: ObjectEntity type: TYPE(t1)
+!CHECK: omp_orig size=4 offset=4: ObjectEntity type: TYPE(t1)
+!CHECK: omp_out size=4 offset=8: ObjectEntity type: TYPE(t1)
+!CHECK: omp_priv size=4 offset=12: ObjectEntity type: TYPE(t1)
+ result = t1(1.0)
+ !$omp parallel do reduction(.modmul.:result)
+ do i = 1, 10
+ result = result .modmul. t1(real(i))
+ end do
+ !$omp end parallel do
+end program test_omp_reduction
diff --git a/flang/test/Semantics/OpenMP/declare-reduction-typeerror.f90 b/flang/test/Semantics/OpenMP/declare-reduction-typeerror.f90
new file mode 100644
index 0000000..b8ede55
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/declare-reduction-typeerror.f90
@@ -0,0 +1,34 @@
+! RUN: not %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s
+
+module mm
+ implicit none
+ type two
+ integer(4) :: a, b
+ end type two
+
+ type three
+ integer(8) :: a, b, c
+ end type three
+contains
+ function add_two(x, y)
+ type(two) add_two, x, y, res
+ add_two = res
+ end function add_two
+
+ function func(n)
+ type(three) :: func
+ type(three) :: res3
+ integer :: n
+ integer :: i
+
+ !$omp declare reduction(dummy:kerflunk:omp_out=omp_out+omp_in)
+!CHECK: error: Derived type 'kerflunk' not found
+
+ !$omp declare reduction(adder:two:omp_out=add_two(omp_out,omp_in))
+ !$omp simd reduction(adder:res3)
+!CHECK: error: The type of 'res3' is incompatible with the reduction operator.
+ do i=1,n
+ enddo
+ func = res3
+ end function func
+end module mm
diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90
index 11612f0..ddca38f 100644
--- a/flang/test/Semantics/OpenMP/declare-reduction.f90
+++ b/flang/test/Semantics/OpenMP/declare-reduction.f90
@@ -17,7 +17,7 @@ function func(x, n, init)
end subroutine initme
end interface
!$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0))
-!CHECK: red_add: Misc ConstructName
+!CHECK: red_add: UserReductionDetails
!CHECK: Subprogram scope: initme
!CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4)
!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4)
@@ -35,7 +35,7 @@ program main
!$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0)
-!CHECK: my_add_red: Misc ConstructName
+!CHECK: my_add_red: UserReductionDetails
!CHECK: omp_in size=4 offset=0: ObjectEntity type: INTEGER(4)
!CHECK: omp_orig size=4 offset=4: ObjectEntity type: INTEGER(4)
!CHECK: omp_out size=4 offset=8: ObjectEntity type: INTEGER(4)
diff --git a/flang/test/Semantics/OpenMP/declare-target-common-block.f90 b/flang/test/Semantics/OpenMP/declare-target-common-block.f90
index 33a093a..ccdedaa 100644
--- a/flang/test/Semantics/OpenMP/declare-target-common-block.f90
+++ b/flang/test/Semantics/OpenMP/declare-target-common-block.f90
@@ -1,8 +1,8 @@
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
PROGRAM main
- !CHECK: one (OmpDeclareTarget) size=4 offset=0: ObjectEntity type: REAL(4)
- !CHECK: two (OmpDeclareTarget) size=4 offset=4: ObjectEntity type: REAL(4)
+ !CHECK: one (InCommonBlock, OmpDeclareTarget) size=4 offset=0: ObjectEntity type: REAL(4)
+ !CHECK: two (InCommonBlock, OmpDeclareTarget) size=4 offset=4: ObjectEntity type: REAL(4)
!CHECK: numbers size=8 offset=0: CommonBlockDetails alignment=4: one two
REAL :: one, two
COMMON /numbers/ one, two
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 7e38435..3e93485 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -169,3 +169,78 @@ subroutine implicit_dsa_test8
end do
!$omp end task
end subroutine
+
+! Test variables defined in modules default to shared DSA
+!DEF: /implicit_dsa_test9_mod Module
+module implicit_dsa_test9_mod
+ !DEF: /implicit_dsa_test9_mod/tm3a PUBLIC (InDataStmt) ObjectEntity COMPLEX(4)
+ complex tm3a/(0,0)/
+ !DEF: /implicit_dsa_test9_mod/tm4a PUBLIC ObjectEntity COMPLEX(4)
+ complex tm4a
+contains
+ !DEF: /implicit_dsa_test9_mod/implict_dsa_test9 PUBLIC (Subroutine) Subprogram
+ subroutine implict_dsa_test9
+ !$omp task
+ !$omp task
+ !DEF: /implicit_dsa_test9_mod/implict_dsa_test9/OtherConstruct1/OtherConstruct1/tm3a (OmpShared) HostAssoc COMPLEX(4)
+ tm3a = (1, 2)
+ !DEF: /implicit_dsa_test9_mod/implict_dsa_test9/OtherConstruct1/OtherConstruct1/tm4a (OmpShared) HostAssoc COMPLEX(4)
+ tm4a = (3, 4)
+ !$omp end task
+ !$omp end task
+ !$omp taskwait
+ !REF: /implicit_dsa_test9_mod/tm3a
+ print *,tm3a
+ end subroutine
+end module
+
+! Test variables in data statement default to shared DSA
+!DEF: /implicit_dsa_test10 (Subroutine) Subprogram
+subroutine implicit_dsa_test10
+ !DEF: /implicit_dsa_test10/tm3a (Implicit, InDataStmt) ObjectEntity REAL(4)
+data tm3a /3/
+!$omp task
+ !$omp task
+ !DEF: /implicit_dsa_test10/OtherConstruct1/OtherConstruct1/tm3a (OmpShared) HostAssoc REAL(4)
+ tm3a = 5
+ !$omp end task
+!$omp end task
+!$omp taskwait
+ !REF: /implicit_dsa_test10/tm3a
+print *,tm3a
+end subroutine
+
+! Test variables with the SAVE attribute default to shared DSA
+!DEF: /implicit_dsa_test_11 (Subroutine) Subprogram
+subroutine implicit_dsa_test_11
+ !DEF: /implicit_dsa_test_11/tm3a SAVE ObjectEntity COMPLEX(4)
+complex, save :: tm3a
+!$omp task
+ !$omp task
+ !DEF: /implicit_dsa_test_11/OtherConstruct1/OtherConstruct1/tm3a (OmpShared) HostAssoc COMPLEX(4)
+ tm3a = (1, 2)
+ !$omp end task
+!$omp end task
+!$omp taskwait
+!REF: /implicit_dsa_test_11/tm3a
+print *,tm3a
+end subroutine
+
+! Test variables referenced in a common block default to shared DSA
+!DEF: /implicit_dsa_test_12 (Subroutine) Subprogram
+subroutine implicit_dsa_test_12
+ !DEF: /implicit_dsa_test_12/tm3a (InCommonBlock) ObjectEntity COMPLEX(4)
+complex tm3a
+ !DEF: /implicit_dsa_test_12/tcom CommonBlockDetails
+ !REF: /implicit_dsa_test_12/tm3a
+common /tcom/ tm3a
+!$omp task
+ !$omp task
+ !DEF: /implicit_dsa_test_12/OtherConstruct1/OtherConstruct1/tm3a (OmpShared) HostAssoc COMPLEX(4)
+ tm3a = (1, 2)
+ !$omp end task
+!$omp end task
+!$omp taskwait
+!REF: /implicit_dsa_test_12/tm3a
+print *,tm3a
+end subroutine
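The new tests above pin down a single rule: variables with static storage -- module variables, DATA-initialized variables, SAVE variables, and COMMON block members -- are implicitly shared inside tasks, while a variable that is private in the enclosing context becomes firstprivate in the task. A hedged C++ sketch of the same defaults follows; it assumes an OpenMP-enabled compiler, and the names are invented for the example.

    #include <cstdio>

    int module_like = 0; // static storage duration -> implicitly shared in tasks

    int main() {
    #pragma omp parallel
    #pragma omp single
      {
        int local = 0; // private here -> implicitly firstprivate in the task
    #pragma omp task
        {
          module_like = 42; // writes the shared variable
          local = 42;       // writes the task's own copy only
        }
    #pragma omp taskwait
        std::printf("%d %d\n", module_like, local); // prints "42 0"
      }
      return 0;
    }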
diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90
index 595b6b8..fbd9a02 100644
--- a/flang/test/Semantics/OpenMP/symbol01.f90
+++ b/flang/test/Semantics/OpenMP/symbol01.f90
@@ -21,8 +21,8 @@ program mm
!REF: /md
use :: md
!DEF: /mm/c CommonBlockDetails
- !DEF: /mm/x ObjectEntity REAL(4)
- !DEF: /mm/y ObjectEntity REAL(4)
+ !DEF: /mm/x (InCommonBlock) ObjectEntity REAL(4)
+ !DEF: /mm/y (InCommonBlock) ObjectEntity REAL(4)
common /c/x, y
!REF: /mm/x
!REF: /mm/y
diff --git a/flang/test/Semantics/offsets03.f90 b/flang/test/Semantics/offsets03.f90
index c8c1abe..75bc43d 100644
--- a/flang/test/Semantics/offsets03.f90
+++ b/flang/test/Semantics/offsets03.f90
@@ -30,10 +30,10 @@ end
! Common block: objects are in order from COMMON statement and not part of module
module md !CHECK: Module scope: md size=1 alignment=1
integer(1) :: i
- integer(2) :: d1 !CHECK: d1, PUBLIC size=2 offset=8:
- integer(4) :: d2 !CHECK: d2, PUBLIC size=4 offset=4:
- integer(1) :: d3 !CHECK: d3, PUBLIC size=1 offset=0:
- real(2) :: d4 !CHECK: d4, PUBLIC size=2 offset=0:
+ integer(2) :: d1 !CHECK: d1, PUBLIC (InCommonBlock) size=2 offset=8:
+ integer(4) :: d2 !CHECK: d2, PUBLIC (InCommonBlock) size=4 offset=4:
+ integer(1) :: d3 !CHECK: d3, PUBLIC (InCommonBlock) size=1 offset=0:
+ real(2) :: d4 !CHECK: d4, PUBLIC (InCommonBlock) size=2 offset=0:
common /common1/ d3,d2,d1 !CHECK: common1 size=10 offset=0: CommonBlockDetails alignment=4:
common /common2/ d4 !CHECK: common2 size=2 offset=0: CommonBlockDetails alignment=2:
end
@@ -71,7 +71,7 @@ end
subroutine host1
contains
subroutine internal
- common /b/ x(4) ! CHECK: x (Implicit) size=16 offset=0: ObjectEntity type: REAL(4) shape: 1_8:4_8
+ common /b/ x(4) ! CHECK: x (Implicit, InCommonBlock) size=16 offset=0: ObjectEntity type: REAL(4) shape: 1_8:4_8
equivalence(x,y) ! CHECK: y (Implicit) size=4 offset=0: ObjectEntity type: REAL(4)
end
end
diff --git a/flang/test/Semantics/resolve121.f90 b/flang/test/Semantics/resolve121.f90
index d84bc53..fa54a09 100644
--- a/flang/test/Semantics/resolve121.f90
+++ b/flang/test/Semantics/resolve121.f90
@@ -25,7 +25,7 @@ subroutine test3()
! CHECK-LABEL: Subprogram scope: test3
! CHECK: i1, SAVE size=4 offset=0: ObjectEntity type: INTEGER(4)
! CHECK: j1, SAVE size=4 offset=0: ObjectEntity type: INTEGER(4)
- ! CHECK: k1, SAVE size=4 offset=0: ObjectEntity type: INTEGER(4)
+ ! CHECK: k1, SAVE (InCommonBlock) size=4 offset=0: ObjectEntity type: INTEGER(4)
integer :: i1
integer :: j1, k1
common /blk/ k1
@@ -37,7 +37,7 @@ subroutine test4()
! CHECK-LABEL: Subprogram scope: test4
! CHECK: i1, SAVE size=4 offset=0: ObjectEntity type: INTEGER(4) init:1_4
! CHECK: j1, SAVE size=4 offset=0: ObjectEntity type: INTEGER(4)
- ! CHECK: k1, SAVE size=4 offset=0: ObjectEntity type: INTEGER(4)
+ ! CHECK: k1, SAVE (InCommonBlock) size=4 offset=0: ObjectEntity type: INTEGER(4)
integer :: i1 = 1
integer :: j1, k1
common /blk/ k1
diff --git a/flang/test/Semantics/symbol33.f90 b/flang/test/Semantics/symbol33.f90
index fbb5321..ccc52b6 100644
--- a/flang/test/Semantics/symbol33.f90
+++ b/flang/test/Semantics/symbol33.f90
@@ -3,7 +3,7 @@
! array element reference still applies implicit typing, &c.
!DEF: /subr (Subroutine) Subprogram
subroutine subr
- !DEF: /subr/moo (Implicit) ObjectEntity INTEGER(4)
+ !DEF: /subr/moo (Implicit, InCommonBlock) ObjectEntity INTEGER(4)
common //moo(1)
!DEF: /subr/a ObjectEntity REAL(4)
!REF: /subr/moo
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 43cefd5..52611e4 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -1706,6 +1706,11 @@ add_dependencies(cxx-headers generate-cxx-headers)
target_include_directories(cxx-headers INTERFACE ${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}
${LIBCXX_GENERATED_INCLUDE_DIR})
+# Make sure to map the generated include directory back to libc++'s actual source directory when generating
+# debug information. Otherwise, the debug information will refer to generated headers, which are created
+# during the build and are generally not persistent.
+target_add_compile_flags_if_supported(cxx-headers INTERFACE "-fdebug-prefix-map=${LIBCXX_GENERATED_INCLUDE_DIR}=${LIBCXX_SOURCE_DIR}/include")
+
if (LIBCXX_INSTALL_HEADERS)
foreach(file ${files})
get_filename_component(dir ${file} DIRECTORY)
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 110450f..38c47e8 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -319,37 +319,12 @@ typedef __char32_t char32_t;
# define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp)
-// Objective-C++ features (opt-in)
-# if __has_feature(objc_arc)
-# define _LIBCPP_HAS_OBJC_ARC 1
-# else
-# define _LIBCPP_HAS_OBJC_ARC 0
-# endif
-
-# if __has_feature(objc_arc_weak)
-# define _LIBCPP_HAS_OBJC_ARC_WEAK 1
-# else
-# define _LIBCPP_HAS_OBJC_ARC_WEAK 0
-# endif
-
-# if __has_extension(blocks)
-# define _LIBCPP_HAS_EXTENSION_BLOCKS 1
-# else
-# define _LIBCPP_HAS_EXTENSION_BLOCKS 0
-# endif
-
-# if _LIBCPP_HAS_EXTENSION_BLOCKS && defined(__APPLE__)
+# if __has_extension(blocks) && defined(__APPLE__)
# define _LIBCPP_HAS_BLOCKS_RUNTIME 1
# else
# define _LIBCPP_HAS_BLOCKS_RUNTIME 0
# endif
-# if __has_feature(address_sanitizer)
-# define _LIBCPP_HAS_ASAN 1
-# else
-# define _LIBCPP_HAS_ASAN 0
-# endif
-
# define _LIBCPP_ALWAYS_INLINE __attribute__((__always_inline__))
# if defined(_LIBCPP_OBJECT_FORMAT_COFF)
diff --git a/libcxx/include/__debug_utils/sanitizers.h b/libcxx/include/__debug_utils/sanitizers.h
index 73d1927..058feab 100644
--- a/libcxx/include/__debug_utils/sanitizers.h
+++ b/libcxx/include/__debug_utils/sanitizers.h
@@ -17,7 +17,7 @@
# pragma GCC system_header
#endif
-#if _LIBCPP_HAS_ASAN
+#if __has_feature(address_sanitizer)
extern "C" {
_LIBCPP_EXPORTED_FROM_ABI void
@@ -28,12 +28,12 @@ _LIBCPP_EXPORTED_FROM_ABI int
__sanitizer_verify_double_ended_contiguous_container(const void*, const void*, const void*, const void*);
}
-#endif // _LIBCPP_HAS_ASAN
+#endif // __has_feature(address_sanitizer)
_LIBCPP_BEGIN_NAMESPACE_STD
// ASan choices
-#if _LIBCPP_HAS_ASAN
+#if __has_feature(address_sanitizer)
# define _LIBCPP_HAS_ASAN_CONTAINER_ANNOTATIONS_FOR_ALL_ALLOCATORS 1
#endif
@@ -57,7 +57,7 @@ _LIBCPP_HIDE_FROM_ABI void __annotate_double_ended_contiguous_container(
const void* __last_old_contained,
const void* __first_new_contained,
const void* __last_new_contained) {
-#if !_LIBCPP_HAS_ASAN
+#if !__has_feature(address_sanitizer)
(void)__first_storage;
(void)__last_storage;
(void)__first_old_contained;
@@ -86,7 +86,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __annotate_contiguous_c
const void* __last_storage,
const void* __old_last_contained,
const void* __new_last_contained) {
-#if !_LIBCPP_HAS_ASAN
+#if !__has_feature(address_sanitizer)
(void)__first_storage;
(void)__last_storage;
(void)__old_last_contained;
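This hunk replaces the _LIBCPP_HAS_ASAN configuration macro with direct __has_feature(address_sanitizer) checks, which Clang evaluates per translation unit. Code outside libc++ that follows the same pattern usually stubs out __has_feature first, since not every compiler defines it; a hedged sketch (the MY_HAS_ASAN name is invented for the example, and GCC's spelling of the same signal is __SANITIZE_ADDRESS__):

    // Fallback so the check still preprocesses on compilers without
    // __has_feature (it then simply evaluates to 0).
    #ifndef __has_feature
    #  define __has_feature(x) 0
    #endif

    #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
    #  define MY_HAS_ASAN 1
    #else
    #  define MY_HAS_ASAN 0
    #endif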
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index e33c5ab..733f321 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -122,7 +122,7 @@ _LIBCPP_HIDE_FROM_ABI bool __not_null(function<_Fp> const& __f) {
return !!__f;
}
-# if _LIBCPP_HAS_EXTENSION_BLOCKS
+# if __has_extension(blocks)
template <class _Rp, class... _Args>
_LIBCPP_HIDE_FROM_ABI bool __not_null(_Rp (^__p)(_Args...)) {
return __p;
@@ -757,7 +757,7 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
public:
_LIBCPP_HIDE_FROM_ABI explicit __func(__block_type const& __f)
-# if _LIBCPP_HAS_OBJC_ARC
+# if __has_feature(objc_arc)
: __f_(__f)
# else
: __f_(reinterpret_cast<__block_type>(__f ? _Block_copy(__f) : nullptr))
@@ -768,7 +768,7 @@ public:
// [TODO] add && to save on a retain
_LIBCPP_HIDE_FROM_ABI explicit __func(__block_type __f, const _Alloc& /* unused */)
-# if _LIBCPP_HAS_OBJC_ARC
+# if __has_feature(objc_arc)
: __f_(__f)
# else
: __f_(reinterpret_cast<__block_type>(__f ? _Block_copy(__f) : nullptr))
@@ -790,7 +790,7 @@ public:
}
_LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy() _NOEXCEPT {
-# if !_LIBCPP_HAS_OBJC_ARC
+# if !__has_feature(objc_arc)
if (__f_)
_Block_release(__f_);
# endif
@@ -822,7 +822,7 @@ public:
# endif // _LIBCPP_HAS_RTTI
};
-# endif // _LIBCPP_HAS_EXTENSION_BLOCKS
+# endif // __has_extension(blocks)
} // namespace __function
diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h
index f9f7d2c..489a6f0 100644
--- a/libcxx/include/__functional/hash.h
+++ b/libcxx/include/__functional/hash.h
@@ -19,6 +19,8 @@
#include <__type_traits/invoke.h>
#include <__type_traits/is_constructible.h>
#include <__type_traits/is_enum.h>
+#include <__type_traits/is_floating_point.h>
+#include <__type_traits/is_integral.h>
#include <__type_traits/underlying_type.h>
#include <__utility/pair.h>
#include <__utility/swap.h>
@@ -345,122 +347,43 @@ struct hash<_Tp*> : public __unary_function<_Tp*, size_t> {
}
};
-template <>
-struct hash<bool> : public __unary_function<bool, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(bool __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<char> : public __unary_function<char, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(char __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<signed char> : public __unary_function<signed char, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(signed char __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<unsigned char> : public __unary_function<unsigned char, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(unsigned char __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-#if _LIBCPP_HAS_CHAR8_T
-template <>
-struct hash<char8_t> : public __unary_function<char8_t, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(char8_t __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-#endif // _LIBCPP_HAS_CHAR8_T
-
-template <>
-struct hash<char16_t> : public __unary_function<char16_t, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(char16_t __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<char32_t> : public __unary_function<char32_t, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(char32_t __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-template <>
-struct hash<wchar_t> : public __unary_function<wchar_t, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(wchar_t __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
-
-template <>
-struct hash<short> : public __unary_function<short, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(short __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<unsigned short> : public __unary_function<unsigned short, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(unsigned short __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
+template <class _Tp, class = void>
+struct __hash_impl {
+ __hash_impl() = delete;
+ __hash_impl(__hash_impl const&) = delete;
+ __hash_impl& operator=(__hash_impl const&) = delete;
};
-template <>
-struct hash<int> : public __unary_function<int, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(int __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<unsigned int> : public __unary_function<unsigned int, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(unsigned int __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<long> : public __unary_function<long, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(long __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
-};
-
-template <>
-struct hash<unsigned long> : public __unary_function<unsigned long, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(unsigned long __v) const _NOEXCEPT {
- static_assert(sizeof(size_t) >= sizeof(unsigned long),
- "This would be a terrible hash function on a platform where size_t is smaller than unsigned long");
- return static_cast<size_t>(__v);
+template <class _Tp>
+struct __hash_impl<_Tp, __enable_if_t<is_enum<_Tp>::value> > : __unary_function<_Tp, size_t> {
+ _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT {
+ using type = __underlying_type_t<_Tp>;
+ return hash<type>()(static_cast<type>(__v));
}
};
-template <>
-struct hash<long long> : public __scalar_hash<long long> {};
-
-template <>
-struct hash<unsigned long long> : public __scalar_hash<unsigned long long> {};
-
-#if _LIBCPP_HAS_INT128
-
-template <>
-struct hash<__int128_t> : public __scalar_hash<__int128_t> {};
-
-template <>
-struct hash<__uint128_t> : public __scalar_hash<__uint128_t> {};
+template <class _Tp>
+struct __hash_impl<_Tp, __enable_if_t<is_integral<_Tp>::value && (sizeof(_Tp) <= sizeof(size_t))> >
+ : __unary_function<_Tp, size_t> {
+ _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
+};
-#endif
+template <class _Tp>
+struct __hash_impl<_Tp, __enable_if_t<is_integral<_Tp>::value && (sizeof(_Tp) > sizeof(size_t))> >
+ : __scalar_hash<_Tp> {};
-template <>
-struct hash<float> : public __scalar_hash<float> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(float __v) const _NOEXCEPT {
+template <class _Tp>
+struct __hash_impl<_Tp, __enable_if_t<is_floating_point<_Tp>::value> > : __scalar_hash<_Tp> {
+ _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT {
// -0.0 and 0.0 should return same hash
if (__v == 0.0f)
return 0;
- return __scalar_hash<float>::operator()(__v);
- }
-};
-
-template <>
-struct hash<double> : public __scalar_hash<double> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(double __v) const _NOEXCEPT {
- // -0.0 and 0.0 should return same hash
- if (__v == 0.0)
- return 0;
- return __scalar_hash<double>::operator()(__v);
+ return __scalar_hash<_Tp>::operator()(__v);
}
};
template <>
-struct hash<long double> : public __scalar_hash<long double> {
+struct __hash_impl<long double> : __scalar_hash<long double> {
_LIBCPP_HIDE_FROM_ABI size_t operator()(long double __v) const _NOEXCEPT {
// -0.0 and 0.0 should return same hash
if (__v == 0.0L)
@@ -501,22 +424,8 @@ struct hash<long double> : public __scalar_hash<long double> {
}
};
-template <class _Tp, bool = is_enum<_Tp>::value>
-struct __enum_hash : public __unary_function<_Tp, size_t> {
- _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT {
- using type = __underlying_type_t<_Tp>;
- return hash<type>()(static_cast<type>(__v));
- }
-};
-template <class _Tp>
-struct __enum_hash<_Tp, false> {
- __enum_hash() = delete;
- __enum_hash(__enum_hash const&) = delete;
- __enum_hash& operator=(__enum_hash const&) = delete;
-};
-
template <class _Tp>
-struct hash : public __enum_hash<_Tp> {};
+struct hash : public __hash_impl<_Tp> {};
#if _LIBCPP_STD_VER >= 17
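The hunk above collapses a long run of per-type hash<T> specializations into one __hash_impl dispatcher selected with enable_if: hash<_Tp> simply derives from __hash_impl<_Tp>, and unsupported types inherit deleted constructors, so the failure mode stays a compile-time error. A standalone sketch of the same dispatch pattern, with invented names rather than libc++'s:

    #include <cstddef>
    #include <type_traits>

    // Primary template: not hashable; the deleted constructor surfaces
    // misuse at compile time, like the __hash_impl primary template above.
    template <class T, class = void>
    struct HashImpl {
      HashImpl() = delete;
    };

    // Enums defer to the hash of their underlying type.
    template <class T>
    struct HashImpl<T, std::enable_if_t<std::is_enum<T>::value>> {
      std::size_t operator()(T v) const noexcept {
        using U = std::underlying_type_t<T>;
        return HashImpl<U>()(static_cast<U>(v));
      }
    };

    // Integral types no wider than size_t hash to themselves.
    template <class T>
    struct HashImpl<T, std::enable_if_t<std::is_integral<T>::value &&
                                        sizeof(T) <= sizeof(std::size_t)>> {
      std::size_t operator()(T v) const noexcept {
        return static_cast<std::size_t>(v);
      }
    };

    // Floating-point types: -0.0 and 0.0 must hash alike, as noted in the
    // hunk above; the plain cast stands in for a real bit-mixing hash.
    template <class T>
    struct HashImpl<T, std::enable_if_t<std::is_floating_point<T>::value>> {
      std::size_t operator()(T v) const noexcept {
        if (v == 0)
          return 0;
        return static_cast<std::size_t>(v);
      }
    };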
diff --git a/libcxx/include/__memory/addressof.h b/libcxx/include/__memory/addressof.h
index 98b08958a..667071d 100644
--- a/libcxx/include/__memory/addressof.h
+++ b/libcxx/include/__memory/addressof.h
@@ -23,7 +23,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_NO_CFI _LIBCPP_HIDE_FROM_ABI _Tp* a
return __builtin_addressof(__x);
}
-#if _LIBCPP_HAS_OBJC_ARC
+#if __has_feature(objc_arc)
// Objective-C++ Automatic Reference Counting uses qualified pointers
// that require special addressof() signatures.
template <class _Tp>
@@ -31,7 +31,7 @@ inline _LIBCPP_HIDE_FROM_ABI __strong _Tp* addressof(__strong _Tp& __x) _NOEXCEP
return &__x;
}
-# if _LIBCPP_HAS_OBJC_ARC_WEAK
+# if __has_feature(objc_arc_weak)
template <class _Tp>
inline _LIBCPP_HIDE_FROM_ABI __weak _Tp* addressof(__weak _Tp& __x) _NOEXCEPT {
return &__x;
diff --git a/libcxx/include/__type_traits/is_pointer.h b/libcxx/include/__type_traits/is_pointer.h
index 7bc78a6..3c58656 100644
--- a/libcxx/include/__type_traits/is_pointer.h
+++ b/libcxx/include/__type_traits/is_pointer.h
@@ -40,7 +40,7 @@ template <class _Tp>
struct __libcpp_remove_objc_qualifiers {
typedef _Tp type;
};
-# if _LIBCPP_HAS_OBJC_ARC
+# if __has_feature(objc_arc)
// clang-format off
template <class _Tp> struct __libcpp_remove_objc_qualifiers<_Tp __strong> { typedef _Tp type; };
template <class _Tp> struct __libcpp_remove_objc_qualifiers<_Tp __weak> { typedef _Tp type; };
diff --git a/libcxx/include/__type_traits/is_scalar.h b/libcxx/include/__type_traits/is_scalar.h
index 102a0ca..4cf8c39 100644
--- a/libcxx/include/__type_traits/is_scalar.h
+++ b/libcxx/include/__type_traits/is_scalar.h
@@ -37,7 +37,7 @@ _LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_scalar_v = __is_scalar(_Tp);
template <class _Tp>
struct __is_block : false_type {};
-# if _LIBCPP_HAS_EXTENSION_BLOCKS
+# if __has_extension(blocks)
template <class _Rp, class... _Args>
struct __is_block<_Rp (^)(_Args...)> : true_type {};
# endif
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 5b9aaf8..e33e7d3 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -930,7 +930,7 @@ private:
(void)__end;
(void)__annotation_type;
(void)__place;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
// __beg - index of the first item to annotate
// __end - index behind the last item to annotate (so last item + 1)
// __annotation_type - __asan_unposion or __asan_poison
@@ -1023,23 +1023,23 @@ private:
std::__annotate_double_ended_contiguous_container<_Allocator>(
__mem_beg, __mem_end, __old_beg, __old_end, __new_beg, __new_end);
}
-# endif // _LIBCPP_HAS_ASAN
+# endif // __has_feature(address_sanitizer)
}
_LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT {
(void)__current_size;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
if (__current_size == 0)
__annotate_from_to(0, __map_.size() * __block_size, __asan_poison, __asan_back_moved);
else {
__annotate_from_to(0, __start_, __asan_poison, __asan_front_moved);
__annotate_from_to(__start_ + __current_size, __map_.size() * __block_size, __asan_poison, __asan_back_moved);
}
-# endif // _LIBCPP_HAS_ASAN
+# endif // __has_feature(address_sanitizer)
}
_LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT {
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
if (empty()) {
for (size_t __i = 0; __i < __map_.size(); ++__i) {
__annotate_whole_block(__i, __asan_unposion);
@@ -1048,19 +1048,19 @@ private:
__annotate_from_to(0, __start_, __asan_unposion, __asan_front_moved);
__annotate_from_to(__start_ + size(), __map_.size() * __block_size, __asan_unposion, __asan_back_moved);
}
-# endif // _LIBCPP_HAS_ASAN
+# endif // __has_feature(address_sanitizer)
}
_LIBCPP_HIDE_FROM_ABI void __annotate_increase_front(size_type __n) const _NOEXCEPT {
(void)__n;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
__annotate_from_to(__start_ - __n, __start_, __asan_unposion, __asan_front_moved);
# endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_increase_back(size_type __n) const _NOEXCEPT {
(void)__n;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
__annotate_from_to(__start_ + size(), __start_ + size() + __n, __asan_unposion, __asan_back_moved);
# endif
}
@@ -1068,7 +1068,7 @@ private:
_LIBCPP_HIDE_FROM_ABI void __annotate_shrink_front(size_type __old_size, size_type __old_start) const _NOEXCEPT {
(void)__old_size;
(void)__old_start;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
__annotate_from_to(__old_start, __old_start + (__old_size - size()), __asan_poison, __asan_front_moved);
# endif
}
@@ -1076,7 +1076,7 @@ private:
_LIBCPP_HIDE_FROM_ABI void __annotate_shrink_back(size_type __old_size, size_type __old_start) const _NOEXCEPT {
(void)__old_size;
(void)__old_start;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
__annotate_from_to(__old_start + size(), __old_start + __old_size, __asan_poison, __asan_back_moved);
# endif
}
@@ -1089,7 +1089,7 @@ private:
__annotate_whole_block(size_t __block_index, __asan_annotation_type __annotation_type) const _NOEXCEPT {
(void)__block_index;
(void)__annotation_type;
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
__map_const_iterator __block_it = __map_.begin() + __block_index;
const void* __block_start = std::__to_address(*__block_it);
const void* __block_end = std::__to_address(*__block_it + __block_size);
@@ -1102,7 +1102,7 @@ private:
}
# endif
}
-# if _LIBCPP_HAS_ASAN
+# if __has_feature(address_sanitizer)
public:
_LIBCPP_HIDE_FROM_ABI bool __verify_asan_annotations() const _NOEXCEPT {
@@ -1164,7 +1164,7 @@ public:
}
private:
-# endif // _LIBCPP_HAS_ASAN
+# endif // __has_feature(address_sanitizer)
_LIBCPP_HIDE_FROM_ABI bool __maybe_remove_front_spare(bool __keep_one = true) {
if (__front_spare_blocks() >= 2 || (!__keep_one && __front_spare_blocks())) {
__annotate_whole_block(0, __asan_unposion);
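The deque hunks above gate libc++'s annotations behind __has_feature(address_sanitizer); underneath, they drive compiler-rt's contiguous-container interface, which keeps [beg, mid) addressable and poisons [mid, end). A hedged sketch of calling that interface directly on a raw buffer -- it must be built with -fsanitize=address, since the symbol lives in the ASan runtime, and the buffer layout is invented for the example:

    // clang++ -fsanitize=address example.cpp
    #include <sanitizer/common_interface_defs.h>

    int main() {
      alignas(8) static char buf[64];
      // The buffer starts fully addressable (old mid == end); shrink the
      // "used" region to the first 16 bytes and poison the rest.
      __sanitizer_annotate_contiguous_container(buf, buf + 64, buf + 64, buf + 16);
      buf[8] = 1;    // inside the used region: fine
      // buf[32] = 1; // would be reported as container-overflow under ASan
      // Grow the used region back to 32 bytes before touching it again.
      __sanitizer_annotate_contiguous_container(buf, buf + 64, buf + 16, buf + 32);
      buf[24] = 1;   // addressable again
      return 0;
    }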
diff --git a/libcxx/include/string b/libcxx/include/string
index 67c4c53..c450ddc 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -678,7 +678,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
_LIBCPP_PUSH_MACROS
# include <__undef_macros>
-# if _LIBCPP_HAS_ASAN && _LIBCPP_INSTRUMENTED_WITH_ASAN
+# if __has_feature(address_sanitizer) && _LIBCPP_INSTRUMENTED_WITH_ASAN
# define _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS __attribute__((__no_sanitize__("address")))
// This macro disables AddressSanitizer (ASan) instrumentation for a specific function,
// allowing memory accesses that would normally trigger ASan errors to proceed without crashing.
@@ -749,7 +749,7 @@ public:
//
// This string implementation doesn't contain any references into itself. It only contains a bit that says whether
// it is in small or large string mode, so the entire structure is trivially relocatable if its members are.
-# if _LIBCPP_HAS_ASAN && _LIBCPP_INSTRUMENTED_WITH_ASAN
+# if __has_feature(address_sanitizer) && _LIBCPP_INSTRUMENTED_WITH_ASAN
// When compiling with AddressSanitizer (ASan), basic_string cannot be trivially
// relocatable. Because the object's memory might be poisoned when its content
// is kept inside objects memory (short string optimization), instead of in allocated
@@ -771,7 +771,7 @@ public:
basic_string,
void>;
-# if _LIBCPP_HAS_ASAN && _LIBCPP_INSTRUMENTED_WITH_ASAN
+# if __has_feature(address_sanitizer) && _LIBCPP_INSTRUMENTED_WITH_ASAN
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __asan_volatile_wrapper(pointer const& __ptr) const {
if (__libcpp_is_constant_evaluated())
return __ptr;
diff --git a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
index 213d06d..8066925 100644
--- a/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_trivially_relocatable.compile.pass.cpp
@@ -87,7 +87,7 @@ static_assert(!std::__libcpp_is_trivially_relocatable<std::array<NotTriviallyCop
static_assert(std::__libcpp_is_trivially_relocatable<std::array<std::unique_ptr<int>, 1> >::value, "");
// basic_string
-#if !_LIBCPP_HAS_ASAN || !_LIBCPP_INSTRUMENTED_WITH_ASAN
+#if !__has_feature(address_sanitizer) || !_LIBCPP_INSTRUMENTED_WITH_ASAN
struct MyChar {
char c;
};
diff --git a/libcxx/test/support/MinSequenceContainer.h b/libcxx/test/support/MinSequenceContainer.h
index 6e61aff..ccc17b7 100644
--- a/libcxx/test/support/MinSequenceContainer.h
+++ b/libcxx/test/support/MinSequenceContainer.h
@@ -28,6 +28,13 @@ struct MinSequenceContainer {
template <class It>
explicit MinSequenceContainer(It first, It last) : data_(first, last) {}
MinSequenceContainer(std::initializer_list<T> il) : data_(il) {}
+
+ template <class It>
+ void assign(It first, It last) {
+ data_.assign(first, last);
+ }
+ void assign(std::initializer_list<T> il) { data_.assign(il); }
+ void assign(size_type n, value_type t) { data_.assign(n, t); }
iterator begin() { return iterator(data_.data()); }
const_iterator begin() const { return const_iterator(data_.data()); }
const_iterator cbegin() const { return const_iterator(data_.data()); }
@@ -47,6 +54,11 @@ struct MinSequenceContainer {
return from_vector_iterator(data_.insert(to_vector_iterator(p), std::move(value)));
}
+ template <class Range>
+ iterator insert_range(const_iterator p, Range&& rg) {
+ return from_vector_iterator(data_.insert_range(to_vector_iterator(p), std::forward<Range>(rg)));
+ }
+
iterator erase(const_iterator first, const_iterator last) {
return from_vector_iterator(data_.erase(to_vector_iterator(first), to_vector_iterator(last)));
}
diff --git a/lld/COFF/TypeMerger.h b/lld/COFF/TypeMerger.h
index b4e3d6e..effb6d6 100644
--- a/lld/COFF/TypeMerger.h
+++ b/lld/COFF/TypeMerger.h
@@ -9,6 +9,7 @@
#ifndef LLD_COFF_TYPEMERGER_H
#define LLD_COFF_TYPEMERGER_H
+#include "COFFLinkerContext.h"
#include "Config.h"
#include "DebugTypes.h"
#include "lld/Common/Timer.h"
diff --git a/lld/ELF/DWARF.h b/lld/ELF/DWARF.h
index 64c25c7..19b1764 100644
--- a/lld/ELF/DWARF.h
+++ b/lld/ELF/DWARF.h
@@ -10,6 +10,7 @@
#define LLD_ELF_DWARF_H
#include "InputFiles.h"
+#include "InputSection.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index 64f2f6e..c117e3b 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -343,7 +343,7 @@ public:
flags.fetch_or(bits, std::memory_order_relaxed);
}
bool hasFlag(uint16_t bit) const {
- assert(bit && (bit & (bit - 1)) == 0 && "bit must be a power of 2");
+ assert(llvm::has_single_bit(bit) && "bit must be a power of 2");
return flags.load(std::memory_order_relaxed) & bit;
}
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index ec1f87a..10dc688 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1030,7 +1030,7 @@ findOrphanPos(Ctx &ctx, SmallVectorImpl<SectionCommand *>::iterator b,
// This matches bfd's behavior and is convenient when the linker script fully
// specifies the start of the file, but doesn't care about the end (the non
// alloc sections for example).
- if (std::find_if(i, e, isOutputSecWithInputSections) == e)
+ if (std::none_of(i, e, isOutputSecWithInputSections))
return e;
while (i != e && shouldSkip(*i))
diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index e2f709e..e7b1691 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -19,10 +19,13 @@
#include "lldb/lldb-enumerations.h"
#include "lldb/lldb-forward.h"
#include "lldb/lldb-private-interfaces.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/JSON.h"
#include <cstddef>
#include <cstdint>
+#include <functional>
#include <vector>
#define LLDB_PLUGIN_DEFINE_ADV(ClassName, PluginName) \
@@ -55,12 +58,67 @@ struct RegisteredPluginInfo {
bool enabled = false;
};
+// Define some data structures to describe known plugin "namespaces".
+// The PluginManager is organized into a series of static functions
+// that operate on different types of plugins, for example SystemRuntime
+// and ObjectFile plugins.
+//
+// The namespace name is used as a prefix when matching plugin names. For
+// example, a "macosx" plugin in the "system-runtime" namespace is matched
+// against a plugin name pattern as "system-runtime.macosx".
+//
+// The plugin namespace here is used so we can operate on all the plugins
+// of a given type, making it easy to enable or disable them as a group.
+using GetPluginInfo = std::function<std::vector<RegisteredPluginInfo>()>;
+using SetPluginEnabled = std::function<bool(llvm::StringRef, bool)>;
+struct PluginNamespace {
+ llvm::StringRef name;
+ GetPluginInfo get_info;
+ SetPluginEnabled set_enabled;
+};
+
class PluginManager {
public:
static void Initialize();
static void Terminate();
+ // Support for enabling and disabling plugins.
+
+ // Return the plugins that can be enabled or disabled by the user.
+ static llvm::ArrayRef<PluginNamespace> GetPluginNamespaces();
+
+ // Generate a json object that describes the plugins that are available.
+ // This is a json representation of the plugin info returned by
+ // GetPluginNamespaces().
+ //
+ // {
+ // <plugin-namespace>: [
+ // {
+ // "enabled": <bool>,
+ // "name": <plugin-name>,
+ // },
+ // ...
+ // ],
+ // ...
+ // }
+ //
+  // If a pattern is given, it will be used to filter the plugins that
+  // are returned. The pattern filters the plugin names using the
+ // PluginManager::MatchPluginName() function.
+ static llvm::json::Object GetJSON(llvm::StringRef pattern = "");
+
+ // Return true if the pattern matches the plugin name.
+ //
+ // The pattern matches the name if it is exactly equal to the namespace name
+ // or if it is equal to the qualified name, which is the namespace name
+ // followed by a dot and the plugin name (e.g. "system-runtime.foo").
+ //
+ // An empty pattern matches all plugins.
+ static bool MatchPluginName(llvm::StringRef pattern,
+ const PluginNamespace &plugin_ns,
+ const RegisteredPluginInfo &plugin);
+
// ABI
static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
ABICreateInstance create_callback);
@@ -491,6 +549,12 @@ public:
static InstrumentationRuntimeCreateInstance
GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx);
+ static std::vector<RegisteredPluginInfo>
+ GetInstrumentationRuntimePluginInfo();
+
+ static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
+ bool enabled);
+
// TypeSystem
static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
TypeSystemCreateInstance create_callback,
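The comments above fully specify the pattern-matching contract for the new plugin commands: an empty pattern matches everything, a bare namespace selects every plugin in it, and anything else must equal the qualified "namespace.name". A hedged sketch restating that contract in isolation (MatchesPlugin is an invented name, not the function from the patch):

    #include <string>

    static bool MatchesPlugin(const std::string &pattern, const std::string &ns,
                              const std::string &name) {
      if (pattern.empty())
        return true;                       // empty pattern matches all plugins
      if (pattern == ns)
        return true;                       // bare namespace selects the group
      return pattern == ns + "." + name;   // otherwise require qualified name
    }

    // MatchesPlugin("", "system-runtime", "foo")                   -> true
    // MatchesPlugin("system-runtime", "system-runtime", "foo")     -> true
    // MatchesPlugin("system-runtime.foo", "system-runtime", "foo") -> true
    // MatchesPlugin("system-runtime.bar", "system-runtime", "foo") -> false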
diff --git a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
index 1875ff6..8535dfc 100644
--- a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
+++ b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
@@ -314,6 +314,7 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = {
{ lldb::eArgTypeModule, "module", lldb::CompletionType::eModuleCompletion, {}, { nullptr, false }, "The name of a module loaded into the current target." },
{ lldb::eArgTypeCPUName, "cpu-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a CPU." },
{ lldb::eArgTypeCPUFeatures, "cpu-features", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The CPU feature string." },
+ { lldb::eArgTypeManagedPlugin, "managed-plugin", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Plugins managed by the PluginManager." },
// clang-format on
};
diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h
index 7fca6383..4356759 100644
--- a/lldb/include/lldb/Symbol/ObjectFile.h
+++ b/lldb/include/lldb/Symbol/ObjectFile.h
@@ -709,6 +709,13 @@ public:
llvm::StringRef name,
lldb::SymbolType symbol_type_hint = lldb::eSymbolTypeUndefined);
+ /// Parses the section type from a section name for DWARF sections.
+ ///
+ /// The \a name must be stripped of the default prefix (e.g. ".debug_" or
+ /// "__debug_"). If there's no matching section type, \a eSectionTypeOther
+ /// will be returned.
+ static lldb::SectionType GetDWARFSectionTypeFromName(llvm::StringRef name);
+
/// Loads this objfile to memory.
///
/// Loads the bits needed to create an executable image to the memory. It is
diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h
index 044a387..b10a3648 100644
--- a/lldb/include/lldb/Target/RegisterContextUnwind.h
+++ b/lldb/include/lldb/Target/RegisterContextUnwind.h
@@ -151,6 +151,9 @@ private:
uint32_t lldb_regnum,
lldb_private::UnwindLLDB::ConcreteRegisterLocation &regloc);
+ std::optional<UnwindPlan::Row::AbstractRegisterLocation>
+ GetAbstractRegisterLocation(uint32_t lldb_regnum, lldb::RegisterKind &kind);
+
bool ReadRegisterValueFromRegisterLocation(
lldb_private::UnwindLLDB::ConcreteRegisterLocation regloc,
const lldb_private::RegisterInfo *reg_info,
diff --git a/lldb/include/lldb/Target/Statistics.h b/lldb/include/lldb/Target/Statistics.h
index 9ac3217..2d0d25c 100644
--- a/lldb/include/lldb/Target/Statistics.h
+++ b/lldb/include/lldb/Target/Statistics.h
@@ -196,12 +196,21 @@ public:
return !GetSummaryOnly();
}
+ void SetIncludePlugins(bool value) { m_include_plugins = value; }
+ bool GetIncludePlugins() const {
+ if (m_include_plugins.has_value())
+ return m_include_plugins.value();
+ // Default to true in both default mode and summary mode.
+ return true;
+ }
+
private:
std::optional<bool> m_summary_only;
std::optional<bool> m_load_all_debug_info;
std::optional<bool> m_include_targets;
std::optional<bool> m_include_modules;
std::optional<bool> m_include_transcript;
+ std::optional<bool> m_include_plugins;
};
/// A class that represents statistics about a TypeSummaryProviders invocations
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 6d10cc8..eeb7299 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -663,6 +663,7 @@ enum CommandArgumentType {
eArgTypeModule,
eArgTypeCPUName,
eArgTypeCPUFeatures,
+ eArgTypeManagedPlugin,
eArgTypeLastArg // Always keep this entry as the last entry in this
// enumeration!!
};
diff --git a/lldb/source/Breakpoint/WatchpointResource.cpp b/lldb/source/Breakpoint/WatchpointResource.cpp
index 49d9d12..c2f510f 100644
--- a/lldb/source/Breakpoint/WatchpointResource.cpp
+++ b/lldb/source/Breakpoint/WatchpointResource.cpp
@@ -56,8 +56,7 @@ void WatchpointResource::AddConstituent(const WatchpointSP &wp_sp) {
void WatchpointResource::RemoveConstituent(WatchpointSP &wp_sp) {
std::lock_guard<std::mutex> guard(m_constituents_mutex);
- const auto &it =
- std::find(m_constituents.begin(), m_constituents.end(), wp_sp);
+ auto it = llvm::find(m_constituents, wp_sp);
if (it != m_constituents.end())
m_constituents.erase(it);
}
diff --git a/lldb/source/Commands/CommandObjectPlugin.cpp b/lldb/source/Commands/CommandObjectPlugin.cpp
index f3108b8..cdc9006 100644
--- a/lldb/source/Commands/CommandObjectPlugin.cpp
+++ b/lldb/source/Commands/CommandObjectPlugin.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "CommandObjectPlugin.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Host/OptionParser.h"
#include "lldb/Interpreter/CommandInterpreter.h"
#include "lldb/Interpreter/CommandReturnObject.h"
@@ -46,12 +48,287 @@ protected:
}
};
+namespace {
+// Helper function to perform an action on each matching plugin.
+// The action callback is given the containing namespace along with plugin info
+// for each matching plugin.
+static int ActOnMatchingPlugins(
+ const llvm::StringRef pattern,
+ std::function<void(const PluginNamespace &plugin_namespace,
+ const std::vector<RegisteredPluginInfo> &plugin_info)>
+ action) {
+ int num_matching = 0;
+
+ for (const PluginNamespace &plugin_namespace :
+ PluginManager::GetPluginNamespaces()) {
+
+ std::vector<RegisteredPluginInfo> matching_plugins;
+ for (const RegisteredPluginInfo &plugin_info :
+ plugin_namespace.get_info()) {
+ if (PluginManager::MatchPluginName(pattern, plugin_namespace,
+ plugin_info))
+ matching_plugins.push_back(plugin_info);
+ }
+
+ if (!matching_plugins.empty()) {
+ num_matching += matching_plugins.size();
+ action(plugin_namespace, matching_plugins);
+ }
+ }
+
+ return num_matching;
+}
+
+// Call the "SetEnable" function for each matching plugins.
+// Used to share the majority of the code between the enable
+// and disable commands.
+int SetEnableOnMatchingPlugins(const llvm::StringRef &pattern,
+ CommandReturnObject &result, bool enabled) {
+ return ActOnMatchingPlugins(
+ pattern, [&](const PluginNamespace &plugin_namespace,
+ const std::vector<RegisteredPluginInfo> &plugins) {
+ result.AppendMessage(plugin_namespace.name);
+ for (const auto &plugin : plugins) {
+ if (!plugin_namespace.set_enabled(plugin.name, enabled)) {
+          result.AppendErrorWithFormat("failed to %s plugin %s.%s",
+                                       enabled ? "enable" : "disable",
+                                       plugin_namespace.name.data(),
+                                       plugin.name.data());
+ continue;
+ }
+
+ result.AppendMessageWithFormat(
+ " %s %-30s %s\n", enabled ? "[+]" : "[-]", plugin.name.data(),
+ plugin.description.data());
+ }
+ });
+}
+
+static std::string ConvertJSONToPrettyString(const llvm::json::Value &json) {
+ std::string str;
+ llvm::raw_string_ostream os(str);
+ os << llvm::formatv("{0:2}", json).str();
+ os.flush();
+ return str;
+}
+
+#define LLDB_OPTIONS_plugin_list
+#include "CommandOptions.inc"
+
+// These option definitions are used by the plugin list command.
+class PluginListCommandOptions : public Options {
+public:
+ PluginListCommandOptions() = default;
+
+ ~PluginListCommandOptions() override = default;
+
+ Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg,
+ ExecutionContext *execution_context) override {
+ Status error;
+ const int short_option = m_getopt_table[option_idx].val;
+
+ switch (short_option) {
+ case 'j':
+ m_json_format = true;
+ break;
+ default:
+ llvm_unreachable("Unimplemented option");
+ }
+
+ return error;
+ }
+
+ void OptionParsingStarting(ExecutionContext *execution_context) override {
+ m_json_format = false;
+ }
+
+ llvm::ArrayRef<OptionDefinition> GetDefinitions() override {
+ return llvm::ArrayRef(g_plugin_list_options);
+ }
+
+ // Instance variables to hold the values for command options.
+ bool m_json_format = false;
+};
+} // namespace
+
+class CommandObjectPluginList : public CommandObjectParsed {
+public:
+ CommandObjectPluginList(CommandInterpreter &interpreter)
+ : CommandObjectParsed(interpreter, "plugin list",
+ "Report info about registered LLDB plugins.",
+ nullptr) {
+ AddSimpleArgumentList(eArgTypeManagedPlugin);
+ SetHelpLong(R"(
+Display information about registered plugins.
+The plugin information is formatted as shown below:
+
+ <plugin-namespace>
+ [+] <plugin-name> Plugin #1 description
+ [-] <plugin-name> Plugin #2 description
+
+An enabled plugin is marked with [+] and a disabled plugin is marked with [-].
+
+Plugins can be listed by namespace and name with:
+
+ plugin list <plugin-namespace>[.<plugin-name>]
+
+Plugins can be listed by namespace alone or with a fully qualified name. When listed
+with just a namespace, all plugins in that namespace are listed. When no arguments
+are given, all plugins are listed.
+
+Examples:
+List all plugins
+
+ (lldb) plugin list
+
+List all plugins in the system-runtime namespace
+
+ (lldb) plugin list system-runtime
+
+List only the plugin 'foo' matching a fully qualified name exactly
+
+ (lldb) plugin list system-runtime.foo
+)");
+ }
+
+ ~CommandObjectPluginList() override = default;
+
+ Options *GetOptions() override { return &m_options; }
+
+protected:
+ void DoExecute(Args &command, CommandReturnObject &result) override {
+ size_t argc = command.GetArgumentCount();
+ result.SetStatus(eReturnStatusSuccessFinishResult);
+
+    // Create a temporary vector to hold the patterns, to simplify the logic
+    // for the case when the user passes no patterns.
+ std::vector<llvm::StringRef> patterns;
+ patterns.reserve(argc == 0 ? 1 : argc);
+ if (argc == 0)
+ patterns.push_back("");
+ else
+ for (size_t i = 0; i < argc; ++i)
+ patterns.push_back(command[i].ref());
+
+ if (m_options.m_json_format)
+ OutputJsonFormat(patterns, result);
+ else
+ OutputTextFormat(patterns, result);
+ }
+
+private:
+ void OutputJsonFormat(const std::vector<llvm::StringRef> &patterns,
+ CommandReturnObject &result) {
+ llvm::json::Object obj;
+ bool found_empty = false;
+ for (const llvm::StringRef pattern : patterns) {
+ llvm::json::Object pat_obj = PluginManager::GetJSON(pattern);
+ if (pat_obj.empty()) {
+ found_empty = true;
+ result.AppendErrorWithFormat(
+ "Found no matching plugins for pattern '%s'", pattern.data());
+ break;
+ }
+ for (auto &entry : pat_obj) {
+ obj[entry.first] = std::move(entry.second);
+ }
+ }
+ if (!found_empty) {
+ result.AppendMessage(ConvertJSONToPrettyString(std::move(obj)));
+ }
+ }
+
+ void OutputTextFormat(const std::vector<llvm::StringRef> &patterns,
+ CommandReturnObject &result) {
+ for (const llvm::StringRef pattern : patterns) {
+ int num_matching = ActOnMatchingPlugins(
+ pattern, [&](const PluginNamespace &plugin_namespace,
+ const std::vector<RegisteredPluginInfo> &plugins) {
+ result.AppendMessage(plugin_namespace.name);
+ for (auto &plugin : plugins) {
+ result.AppendMessageWithFormat(
+ " %s %-30s %s\n", plugin.enabled ? "[+]" : "[-]",
+ plugin.name.data(), plugin.description.data());
+ }
+ });
+ if (num_matching == 0) {
+ result.AppendErrorWithFormat(
+ "Found no matching plugins for pattern '%s'", pattern.data());
+ break;
+ }
+ }
+ }
+
+ PluginListCommandOptions m_options;
+};
+
+static void DoPluginEnableDisable(Args &command, CommandReturnObject &result,
+ bool enable) {
+ const char *name = enable ? "enable" : "disable";
+ size_t argc = command.GetArgumentCount();
+ if (argc == 0) {
+ result.AppendErrorWithFormat("'plugin %s' requires one or more arguments",
+ name);
+ return;
+ }
+ result.SetStatus(eReturnStatusSuccessFinishResult);
+
+ for (size_t i = 0; i < argc; ++i) {
+ llvm::StringRef pattern = command[i].ref();
+ int num_matching = SetEnableOnMatchingPlugins(pattern, result, enable);
+
+ if (num_matching == 0) {
+ result.AppendErrorWithFormat(
+ "Found no matching plugins to %s for pattern '%s'", name,
+ pattern.data());
+ break;
+ }
+ }
+}
+
+class CommandObjectPluginEnable : public CommandObjectParsed {
+public:
+ CommandObjectPluginEnable(CommandInterpreter &interpreter)
+ : CommandObjectParsed(interpreter, "plugin enable",
+ "Enable registered LLDB plugins.", nullptr) {
+ AddSimpleArgumentList(eArgTypeManagedPlugin);
+ }
+
+ ~CommandObjectPluginEnable() override = default;
+
+protected:
+ void DoExecute(Args &command, CommandReturnObject &result) override {
+ DoPluginEnableDisable(command, result, /*enable=*/true);
+ }
+};
+
+class CommandObjectPluginDisable : public CommandObjectParsed {
+public:
+ CommandObjectPluginDisable(CommandInterpreter &interpreter)
+ : CommandObjectParsed(interpreter, "plugin disable",
+ "Disable registered LLDB plugins.", nullptr) {
+ AddSimpleArgumentList(eArgTypeManagedPlugin);
+ }
+
+ ~CommandObjectPluginDisable() override = default;
+
+protected:
+ void DoExecute(Args &command, CommandReturnObject &result) override {
+ DoPluginEnableDisable(command, result, /*enable=*/false);
+ }
+};
+
CommandObjectPlugin::CommandObjectPlugin(CommandInterpreter &interpreter)
: CommandObjectMultiword(interpreter, "plugin",
"Commands for managing LLDB plugins.",
"plugin <subcommand> [<subcommand-options>]") {
LoadSubCommand("load",
CommandObjectSP(new CommandObjectPluginLoad(interpreter)));
+ LoadSubCommand("list",
+ CommandObjectSP(new CommandObjectPluginList(interpreter)));
+ LoadSubCommand("enable",
+ CommandObjectSP(new CommandObjectPluginEnable(interpreter)));
+ LoadSubCommand("disable",
+ CommandObjectSP(new CommandObjectPluginDisable(interpreter)));
}
CommandObjectPlugin::~CommandObjectPlugin() = default;
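A rough transcript of the new subcommands in use (a sketch only: the plugin
name is taken from the test further down in this diff, and the description
column is illustrative):

  (lldb) plugin list system-runtime
  system-runtime
    [+] systemruntime-macosx           <description>
  (lldb) plugin disable system-runtime.systemruntime-macosx
  (lldb) plugin list system-runtime
  system-runtime
    [-] systemruntime-macosx           <description>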
diff --git a/lldb/source/Commands/CommandObjectStats.cpp b/lldb/source/Commands/CommandObjectStats.cpp
index 7d333af..b77c44b 100644
--- a/lldb/source/Commands/CommandObjectStats.cpp
+++ b/lldb/source/Commands/CommandObjectStats.cpp
@@ -103,6 +103,13 @@ class CommandObjectStatsDump : public CommandObjectParsed {
else
error = Status::FromError(bool_or_error.takeError());
break;
+ case 'p':
+ if (llvm::Expected<bool> bool_or_error =
+ OptionArgParser::ToBoolean("--plugins", option_arg))
+ m_stats_options.SetIncludePlugins(*bool_or_error);
+ else
+ error = Status::FromError(bool_or_error.takeError());
+ break;
default:
llvm_unreachable("Unimplemented option");
}
diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td
index 848d42da..75bdffe 100644
--- a/lldb/source/Commands/Options.td
+++ b/lldb/source/Commands/Options.td
@@ -683,6 +683,11 @@ let Command = "platform shell" in {
Desc<"Shell interpreter path. This is the binary used to run the command.">;
}
+let Command = "plugin list" in {
+ def plugin_list_json : Option<"json", "j">,
+ Desc<"Output the plugin list in json format.">;
+}
+
let Command = "process launch" in {
def process_launch_stop_at_entry : Option<"stop-at-entry", "s">,
Desc<"Stop at the entry point of the program when launching a process.">;
@@ -1484,5 +1489,11 @@ let Command = "statistics dump" in {
"scripts executed during a debug session. "
"Defaults to true, unless the '--summary' mode is enabled, in which case "
"this is turned off unless specified.">;
-
+ def statistics_dump_plugins
+ : Option<"plugins", "p">,
+ Group<1>,
+ Arg<"Boolean">,
+ Desc<"Dump statistics for known plugins including name, order, and "
+ "enabled state. Defaults to true for both summary and default "
+ "mode.">;
}
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index de815e6..5d44434 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -181,6 +181,56 @@ void PluginManager::Terminate() {
plugin_map.clear();
}
+llvm::ArrayRef<PluginNamespace> PluginManager::GetPluginNamespaces() {
+ // Currently supported set of plugin namespaces. This will be expanded
+ // over time.
+ static PluginNamespace PluginNamespaces[] = {
+ {"system-runtime", PluginManager::GetSystemRuntimePluginInfo,
+ PluginManager::SetSystemRuntimePluginEnabled},
+ {"instrumentation-runtime",
+ PluginManager::GetInstrumentationRuntimePluginInfo,
+ PluginManager::SetInstrumentationRuntimePluginEnabled}};
+
+ return PluginNamespaces;
+}
+
+llvm::json::Object PluginManager::GetJSON(llvm::StringRef pattern) {
+ llvm::json::Object plugin_stats;
+
+ for (const PluginNamespace &plugin_ns : GetPluginNamespaces()) {
+ llvm::json::Array namespace_stats;
+
+ for (const RegisteredPluginInfo &plugin : plugin_ns.get_info()) {
+ if (MatchPluginName(pattern, plugin_ns, plugin)) {
+ llvm::json::Object plugin_json;
+ plugin_json.try_emplace("name", plugin.name);
+ plugin_json.try_emplace("enabled", plugin.enabled);
+ namespace_stats.emplace_back(std::move(plugin_json));
+ }
+ }
+ if (!namespace_stats.empty())
+ plugin_stats.try_emplace(plugin_ns.name, std::move(namespace_stats));
+ }
+
+ return plugin_stats;
+}
+
+bool PluginManager::MatchPluginName(llvm::StringRef pattern,
+ const PluginNamespace &plugin_ns,
+ const RegisteredPluginInfo &plugin_info) {
+ // The empty pattern matches all plugins.
+ if (pattern.empty())
+ return true;
+
+ // Check if the pattern matches the namespace.
+ if (pattern == plugin_ns.name)
+ return true;
+
+ // Check if the pattern matches the qualified name.
+ std::string qualified_name = (plugin_ns.name + "." + plugin_info.name).str();
+ return pattern == qualified_name;
+}
+
template <typename Callback> struct PluginInstance {
typedef Callback CallbackType;
@@ -1513,6 +1563,16 @@ PluginManager::GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx) {
return GetInstrumentationRuntimeInstances().GetCallbackAtIndex(idx);
}
+std::vector<RegisteredPluginInfo>
+PluginManager::GetInstrumentationRuntimePluginInfo() {
+ return GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances();
+}
+
+bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
+ bool enable) {
+ return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
#pragma mark TypeSystem
struct TypeSystemInstance : public PluginInstance<TypeSystemCreateInstance> {
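In MatchPluginName terms, the three accepted pattern forms resolve as follows
(namespace and plugin names are illustrative):

  MatchPluginName("", ns, info)                   // true for every plugin
  MatchPluginName("system-runtime", ns, info)     // true for every plugin in
                                                  // the system-runtime namespace
  MatchPluginName("system-runtime.foo", ns, info) // true only for plugin "foo"
                                                  // in that namespace

A bare plugin name without its namespace matches nothing, which is why the
callers above report "Found no matching plugins" for such patterns.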
diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp
index 5ae3c9e..6613243 100644
--- a/lldb/source/Expression/DWARFExpression.cpp
+++ b/lldb/source/Expression/DWARFExpression.cpp
@@ -80,8 +80,8 @@ void DWARFExpression::DumpLocation(Stream *s, lldb::DescriptionLevel level,
};
llvm::DIDumpOptions DumpOpts;
DumpOpts.GetNameForDWARFReg = GetRegName;
- llvm::DWARFExpression(m_data.GetAsLLVM(), m_data.GetAddressByteSize())
- .print(s->AsRawOstream(), DumpOpts, nullptr);
+ llvm::DWARFExpression E(m_data.GetAsLLVM(), m_data.GetAddressByteSize());
+ llvm::DWARFExpressionPrinter::print(&E, s->AsRawOstream(), DumpOpts, nullptr);
}
RegisterKind DWARFExpression::GetRegisterKind() const { return m_reg_kind; }
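The same DWARFExpressionPrinter migration reappears in UnwindPlan.cpp later in
this diff; the shape of the new call, in outline (a sketch using placeholder
names, based on the calls this patch makes):

  llvm::DWARFExpression E(extractor, addr_byte_size);
  llvm::DWARFExpressionPrinter::print(&E, os, dump_opts, nullptr);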
diff --git a/lldb/source/Expression/FunctionCaller.cpp b/lldb/source/Expression/FunctionCaller.cpp
index 83cac13..6c93d94 100644
--- a/lldb/source/Expression/FunctionCaller.cpp
+++ b/lldb/source/Expression/FunctionCaller.cpp
@@ -323,8 +323,7 @@ bool FunctionCaller::FetchFunctionResults(ExecutionContext &exe_ctx,
void FunctionCaller::DeallocateFunctionResults(ExecutionContext &exe_ctx,
lldb::addr_t args_addr) {
std::list<lldb::addr_t>::iterator pos;
- pos = std::find(m_wrapper_args_addrs.begin(), m_wrapper_args_addrs.end(),
- args_addr);
+ pos = llvm::find(m_wrapper_args_addrs, args_addr);
if (pos != m_wrapper_args_addrs.end())
m_wrapper_args_addrs.erase(pos);
diff --git a/lldb/source/Plugins/ObjectFile/COFF/ObjectFileCOFF.cpp b/lldb/source/Plugins/ObjectFile/COFF/ObjectFileCOFF.cpp
index a7ad5d2..1121f69 100644
--- a/lldb/source/Plugins/ObjectFile/COFF/ObjectFileCOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/COFF/ObjectFileCOFF.cpp
@@ -191,19 +191,15 @@ void ObjectFileCOFF::CreateSections(lldb_private::SectionList &sections) {
auto SectionType = [](StringRef Name,
const coff_section *Section) -> lldb::SectionType {
- lldb::SectionType type =
- StringSwitch<lldb::SectionType>(Name)
- // DWARF Debug Sections
- .Case(".debug_abbrev", eSectionTypeDWARFDebugAbbrev)
- .Case(".debug_info", eSectionTypeDWARFDebugInfo)
- .Case(".debug_line", eSectionTypeDWARFDebugLine)
- .Case(".debug_pubnames", eSectionTypeDWARFDebugPubNames)
- .Case(".debug_pubtypes", eSectionTypeDWARFDebugPubTypes)
- .Case(".debug_str", eSectionTypeDWARFDebugStr)
- // CodeView Debug Sections: .debug$S, .debug$T
- .StartsWith(".debug$", eSectionTypeDebug)
- .Case("clangast", eSectionTypeOther)
- .Default(eSectionTypeInvalid);
+ // DWARF Debug Sections
+ if (Name.consume_front(".debug_"))
+ return GetDWARFSectionTypeFromName(Name);
+
+ lldb::SectionType type = StringSwitch<lldb::SectionType>(Name)
+ // CodeView Debug Sections: .debug$S, .debug$T
+ .StartsWith(".debug$", eSectionTypeDebug)
+ .Case("clangast", eSectionTypeOther)
+ .Default(eSectionTypeInvalid);
if (type != eSectionTypeInvalid)
return type;
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 13e1198..f69358d 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -1653,39 +1653,9 @@ lldb::user_id_t ObjectFileELF::GetSectionIndexByName(const char *name) {
}
static SectionType GetSectionTypeFromName(llvm::StringRef Name) {
- if (Name.consume_front(".debug_")) {
- return llvm::StringSwitch<SectionType>(Name)
- .Case("abbrev", eSectionTypeDWARFDebugAbbrev)
- .Case("abbrev.dwo", eSectionTypeDWARFDebugAbbrevDwo)
- .Case("addr", eSectionTypeDWARFDebugAddr)
- .Case("aranges", eSectionTypeDWARFDebugAranges)
- .Case("cu_index", eSectionTypeDWARFDebugCuIndex)
- .Case("frame", eSectionTypeDWARFDebugFrame)
- .Case("info", eSectionTypeDWARFDebugInfo)
- .Case("info.dwo", eSectionTypeDWARFDebugInfoDwo)
- .Cases("line", "line.dwo", eSectionTypeDWARFDebugLine)
- .Cases("line_str", "line_str.dwo", eSectionTypeDWARFDebugLineStr)
- .Case("loc", eSectionTypeDWARFDebugLoc)
- .Case("loc.dwo", eSectionTypeDWARFDebugLocDwo)
- .Case("loclists", eSectionTypeDWARFDebugLocLists)
- .Case("loclists.dwo", eSectionTypeDWARFDebugLocListsDwo)
- .Case("macinfo", eSectionTypeDWARFDebugMacInfo)
- .Cases("macro", "macro.dwo", eSectionTypeDWARFDebugMacro)
- .Case("names", eSectionTypeDWARFDebugNames)
- .Case("pubnames", eSectionTypeDWARFDebugPubNames)
- .Case("pubtypes", eSectionTypeDWARFDebugPubTypes)
- .Case("ranges", eSectionTypeDWARFDebugRanges)
- .Case("rnglists", eSectionTypeDWARFDebugRngLists)
- .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo)
- .Case("str", eSectionTypeDWARFDebugStr)
- .Case("str.dwo", eSectionTypeDWARFDebugStrDwo)
- .Case("str_offsets", eSectionTypeDWARFDebugStrOffsets)
- .Case("str_offsets.dwo", eSectionTypeDWARFDebugStrOffsetsDwo)
- .Case("tu_index", eSectionTypeDWARFDebugTuIndex)
- .Case("types", eSectionTypeDWARFDebugTypes)
- .Case("types.dwo", eSectionTypeDWARFDebugTypesDwo)
- .Default(eSectionTypeOther);
- }
+ if (Name.consume_front(".debug_"))
+ return ObjectFile::GetDWARFSectionTypeFromName(Name);
+
return llvm::StringSwitch<SectionType>(Name)
.Case(".ARM.exidx", eSectionTypeARMexidx)
.Case(".ARM.extab", eSectionTypeARMextab)
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 3950454..b174192 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -1595,34 +1595,8 @@ static lldb::SectionType GetSectionType(uint32_t flags,
static ConstString g_sect_name_objc_classlist("__objc_classlist");
static ConstString g_sect_name_cfstring("__cfstring");
- static ConstString g_sect_name_dwarf_debug_abbrev("__debug_abbrev");
- static ConstString g_sect_name_dwarf_debug_abbrev_dwo("__debug_abbrev.dwo");
- static ConstString g_sect_name_dwarf_debug_addr("__debug_addr");
- static ConstString g_sect_name_dwarf_debug_aranges("__debug_aranges");
- static ConstString g_sect_name_dwarf_debug_cu_index("__debug_cu_index");
- static ConstString g_sect_name_dwarf_debug_frame("__debug_frame");
- static ConstString g_sect_name_dwarf_debug_info("__debug_info");
- static ConstString g_sect_name_dwarf_debug_info_dwo("__debug_info.dwo");
- static ConstString g_sect_name_dwarf_debug_line("__debug_line");
- static ConstString g_sect_name_dwarf_debug_line_dwo("__debug_line.dwo");
- static ConstString g_sect_name_dwarf_debug_line_str("__debug_line_str");
- static ConstString g_sect_name_dwarf_debug_loc("__debug_loc");
- static ConstString g_sect_name_dwarf_debug_loclists("__debug_loclists");
- static ConstString g_sect_name_dwarf_debug_loclists_dwo("__debug_loclists.dwo");
- static ConstString g_sect_name_dwarf_debug_macinfo("__debug_macinfo");
- static ConstString g_sect_name_dwarf_debug_macro("__debug_macro");
- static ConstString g_sect_name_dwarf_debug_macro_dwo("__debug_macro.dwo");
- static ConstString g_sect_name_dwarf_debug_names("__debug_names");
- static ConstString g_sect_name_dwarf_debug_pubnames("__debug_pubnames");
- static ConstString g_sect_name_dwarf_debug_pubtypes("__debug_pubtypes");
- static ConstString g_sect_name_dwarf_debug_ranges("__debug_ranges");
- static ConstString g_sect_name_dwarf_debug_rnglists("__debug_rnglists");
- static ConstString g_sect_name_dwarf_debug_str("__debug_str");
- static ConstString g_sect_name_dwarf_debug_str_dwo("__debug_str.dwo");
static ConstString g_sect_name_dwarf_debug_str_offs("__debug_str_offs");
static ConstString g_sect_name_dwarf_debug_str_offs_dwo("__debug_str_offs.dwo");
- static ConstString g_sect_name_dwarf_debug_tu_index("__debug_tu_index");
- static ConstString g_sect_name_dwarf_debug_types("__debug_types");
static ConstString g_sect_name_dwarf_apple_names("__apple_names");
static ConstString g_sect_name_dwarf_apple_types("__apple_types");
static ConstString g_sect_name_dwarf_apple_namespaces("__apple_namespac");
@@ -1637,62 +1611,15 @@ static lldb::SectionType GetSectionType(uint32_t flags,
static ConstString g_sect_name_lldb_formatters("__lldbformatters");
static ConstString g_sect_name_swift_ast("__swift_ast");
- if (section_name == g_sect_name_dwarf_debug_abbrev)
- return eSectionTypeDWARFDebugAbbrev;
- if (section_name == g_sect_name_dwarf_debug_abbrev_dwo)
- return eSectionTypeDWARFDebugAbbrevDwo;
- if (section_name == g_sect_name_dwarf_debug_addr)
- return eSectionTypeDWARFDebugAddr;
- if (section_name == g_sect_name_dwarf_debug_aranges)
- return eSectionTypeDWARFDebugAranges;
- if (section_name == g_sect_name_dwarf_debug_cu_index)
- return eSectionTypeDWARFDebugCuIndex;
- if (section_name == g_sect_name_dwarf_debug_frame)
- return eSectionTypeDWARFDebugFrame;
- if (section_name == g_sect_name_dwarf_debug_info)
- return eSectionTypeDWARFDebugInfo;
- if (section_name == g_sect_name_dwarf_debug_info_dwo)
- return eSectionTypeDWARFDebugInfoDwo;
- if (section_name == g_sect_name_dwarf_debug_line)
- return eSectionTypeDWARFDebugLine;
- if (section_name == g_sect_name_dwarf_debug_line_dwo)
- return eSectionTypeDWARFDebugLine; // Same as debug_line.
- if (section_name == g_sect_name_dwarf_debug_line_str)
- return eSectionTypeDWARFDebugLineStr;
- if (section_name == g_sect_name_dwarf_debug_loc)
- return eSectionTypeDWARFDebugLoc;
- if (section_name == g_sect_name_dwarf_debug_loclists)
- return eSectionTypeDWARFDebugLocLists;
- if (section_name == g_sect_name_dwarf_debug_loclists_dwo)
- return eSectionTypeDWARFDebugLocListsDwo;
- if (section_name == g_sect_name_dwarf_debug_macinfo)
- return eSectionTypeDWARFDebugMacInfo;
- if (section_name == g_sect_name_dwarf_debug_macro)
- return eSectionTypeDWARFDebugMacro;
- if (section_name == g_sect_name_dwarf_debug_macro_dwo)
- return eSectionTypeDWARFDebugMacInfo; // Same as debug_macro.
- if (section_name == g_sect_name_dwarf_debug_names)
- return eSectionTypeDWARFDebugNames;
- if (section_name == g_sect_name_dwarf_debug_pubnames)
- return eSectionTypeDWARFDebugPubNames;
- if (section_name == g_sect_name_dwarf_debug_pubtypes)
- return eSectionTypeDWARFDebugPubTypes;
- if (section_name == g_sect_name_dwarf_debug_ranges)
- return eSectionTypeDWARFDebugRanges;
- if (section_name == g_sect_name_dwarf_debug_rnglists)
- return eSectionTypeDWARFDebugRngLists;
- if (section_name == g_sect_name_dwarf_debug_str)
- return eSectionTypeDWARFDebugStr;
- if (section_name == g_sect_name_dwarf_debug_str_dwo)
- return eSectionTypeDWARFDebugStrDwo;
if (section_name == g_sect_name_dwarf_debug_str_offs)
return eSectionTypeDWARFDebugStrOffsets;
if (section_name == g_sect_name_dwarf_debug_str_offs_dwo)
return eSectionTypeDWARFDebugStrOffsetsDwo;
- if (section_name == g_sect_name_dwarf_debug_tu_index)
- return eSectionTypeDWARFDebugTuIndex;
- if (section_name == g_sect_name_dwarf_debug_types)
- return eSectionTypeDWARFDebugTypes;
+
+ llvm::StringRef stripped_name = section_name.GetStringRef();
+ if (stripped_name.consume_front("__debug_"))
+ return ObjectFile::GetDWARFSectionTypeFromName(stripped_name);
+
if (section_name == g_sect_name_dwarf_apple_names)
return eSectionTypeDWARFAppleNames;
if (section_name == g_sect_name_dwarf_apple_types)
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
index 389339a..4984445 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
@@ -976,25 +976,14 @@ SectionType ObjectFilePECOFF::GetSectionType(llvm::StringRef sect_name,
return eSectionTypeData;
}
+ if (sect_name.consume_front(".debug_"))
+ return GetDWARFSectionTypeFromName(sect_name);
+
SectionType section_type =
llvm::StringSwitch<SectionType>(sect_name)
.Case(".debug", eSectionTypeDebug)
.Case(".stabstr", eSectionTypeDataCString)
.Case(".reloc", eSectionTypeOther)
- .Case(".debug_abbrev", eSectionTypeDWARFDebugAbbrev)
- .Case(".debug_aranges", eSectionTypeDWARFDebugAranges)
- .Case(".debug_frame", eSectionTypeDWARFDebugFrame)
- .Case(".debug_info", eSectionTypeDWARFDebugInfo)
- .Case(".debug_line", eSectionTypeDWARFDebugLine)
- .Case(".debug_loc", eSectionTypeDWARFDebugLoc)
- .Case(".debug_loclists", eSectionTypeDWARFDebugLocLists)
- .Case(".debug_macinfo", eSectionTypeDWARFDebugMacInfo)
- .Case(".debug_names", eSectionTypeDWARFDebugNames)
- .Case(".debug_pubnames", eSectionTypeDWARFDebugPubNames)
- .Case(".debug_pubtypes", eSectionTypeDWARFDebugPubTypes)
- .Case(".debug_ranges", eSectionTypeDWARFDebugRanges)
- .Case(".debug_str", eSectionTypeDWARFDebugStr)
- .Case(".debug_types", eSectionTypeDWARFDebugTypes)
// .eh_frame can be truncated to 8 chars.
.Cases(".eh_frame", ".eh_fram", eSectionTypeEHFrame)
.Case(".gosymtab", eSectionTypeGoSymtab)
diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
index 06eb6ff..67963a7 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
@@ -252,37 +252,7 @@ void ObjectFileWasm::ParseSymtab(Symtab &symtab) {}
static SectionType GetSectionTypeFromName(llvm::StringRef Name) {
if (Name.consume_front(".debug_") || Name.consume_front(".zdebug_")) {
- return llvm::StringSwitch<SectionType>(Name)
- .Case("abbrev", eSectionTypeDWARFDebugAbbrev)
- .Case("abbrev.dwo", eSectionTypeDWARFDebugAbbrevDwo)
- .Case("addr", eSectionTypeDWARFDebugAddr)
- .Case("aranges", eSectionTypeDWARFDebugAranges)
- .Case("cu_index", eSectionTypeDWARFDebugCuIndex)
- .Case("frame", eSectionTypeDWARFDebugFrame)
- .Case("info", eSectionTypeDWARFDebugInfo)
- .Case("info.dwo", eSectionTypeDWARFDebugInfoDwo)
- .Cases("line", "line.dwo", eSectionTypeDWARFDebugLine)
- .Cases("line_str", "line_str.dwo", eSectionTypeDWARFDebugLineStr)
- .Case("loc", eSectionTypeDWARFDebugLoc)
- .Case("loc.dwo", eSectionTypeDWARFDebugLocDwo)
- .Case("loclists", eSectionTypeDWARFDebugLocLists)
- .Case("loclists.dwo", eSectionTypeDWARFDebugLocListsDwo)
- .Case("macinfo", eSectionTypeDWARFDebugMacInfo)
- .Cases("macro", "macro.dwo", eSectionTypeDWARFDebugMacro)
- .Case("names", eSectionTypeDWARFDebugNames)
- .Case("pubnames", eSectionTypeDWARFDebugPubNames)
- .Case("pubtypes", eSectionTypeDWARFDebugPubTypes)
- .Case("ranges", eSectionTypeDWARFDebugRanges)
- .Case("rnglists", eSectionTypeDWARFDebugRngLists)
- .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo)
- .Case("str", eSectionTypeDWARFDebugStr)
- .Case("str.dwo", eSectionTypeDWARFDebugStrDwo)
- .Case("str_offsets", eSectionTypeDWARFDebugStrOffsets)
- .Case("str_offsets.dwo", eSectionTypeDWARFDebugStrOffsetsDwo)
- .Case("tu_index", eSectionTypeDWARFDebugTuIndex)
- .Case("types", eSectionTypeDWARFDebugTypes)
- .Case("types.dwo", eSectionTypeDWARFDebugTypesDwo)
- .Default(eSectionTypeOther);
+ return ObjectFile::GetDWARFSectionTypeFromName(Name);
}
return eSectionTypeOther;
}
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 420c84b..f18bdd5 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -1728,7 +1728,7 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
reg_ctx_sp->InvalidateIfNeeded(true);
- auto iter = std::find(m_thread_ids.begin(), m_thread_ids.end(), tid);
+ auto iter = llvm::find(m_thread_ids, tid);
if (iter != m_thread_ids.end())
SetThreadPc(thread_sp, iter - m_thread_ids.begin());
diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp
index afd0d29..21daf74 100644
--- a/lldb/source/Symbol/ObjectFile.cpp
+++ b/lldb/source/Symbol/ObjectFile.cpp
@@ -635,6 +635,41 @@ ObjectFile::GetSymbolTypeFromName(llvm::StringRef name,
return symbol_type_hint;
}
+lldb::SectionType
+ObjectFile::GetDWARFSectionTypeFromName(llvm::StringRef name) {
+ return llvm::StringSwitch<SectionType>(name)
+ .Case("abbrev", eSectionTypeDWARFDebugAbbrev)
+ .Case("abbrev.dwo", eSectionTypeDWARFDebugAbbrevDwo)
+ .Case("addr", eSectionTypeDWARFDebugAddr)
+ .Case("aranges", eSectionTypeDWARFDebugAranges)
+ .Case("cu_index", eSectionTypeDWARFDebugCuIndex)
+ .Case("frame", eSectionTypeDWARFDebugFrame)
+ .Case("info", eSectionTypeDWARFDebugInfo)
+ .Case("info.dwo", eSectionTypeDWARFDebugInfoDwo)
+ .Cases("line", "line.dwo", eSectionTypeDWARFDebugLine)
+ .Cases("line_str", "line_str.dwo", eSectionTypeDWARFDebugLineStr)
+ .Case("loc", eSectionTypeDWARFDebugLoc)
+ .Case("loc.dwo", eSectionTypeDWARFDebugLocDwo)
+ .Case("loclists", eSectionTypeDWARFDebugLocLists)
+ .Case("loclists.dwo", eSectionTypeDWARFDebugLocListsDwo)
+ .Case("macinfo", eSectionTypeDWARFDebugMacInfo)
+ .Cases("macro", "macro.dwo", eSectionTypeDWARFDebugMacro)
+ .Case("names", eSectionTypeDWARFDebugNames)
+ .Case("pubnames", eSectionTypeDWARFDebugPubNames)
+ .Case("pubtypes", eSectionTypeDWARFDebugPubTypes)
+ .Case("ranges", eSectionTypeDWARFDebugRanges)
+ .Case("rnglists", eSectionTypeDWARFDebugRngLists)
+ .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo)
+ .Case("str", eSectionTypeDWARFDebugStr)
+ .Case("str.dwo", eSectionTypeDWARFDebugStrDwo)
+ .Case("str_offsets", eSectionTypeDWARFDebugStrOffsets)
+ .Case("str_offsets.dwo", eSectionTypeDWARFDebugStrOffsetsDwo)
+ .Case("tu_index", eSectionTypeDWARFDebugTuIndex)
+ .Case("types", eSectionTypeDWARFDebugTypes)
+ .Case("types.dwo", eSectionTypeDWARFDebugTypesDwo)
+ .Default(eSectionTypeOther);
+}
+
std::vector<ObjectFile::LoadableData>
ObjectFile::GetLoadableData(Target &target) {
std::vector<LoadableData> loadables;
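With the helper in place, every object-file reader touched by this patch
reduces to the same call-site shape (a sketch, not a verbatim excerpt; the
stripped prefix varies per format):

  // ELF/COFF/PECOFF strip ".debug_", wasm also accepts ".zdebug_",
  // and Mach-O strips "__debug_" before calling the shared helper.
  if (name.consume_front(".debug_"))
    return ObjectFile::GetDWARFSectionTypeFromName(name); // "info" -> eSectionTypeDWARFDebugInfo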
diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp
index b1a96b5..e9ac6b6 100644
--- a/lldb/source/Symbol/UnwindPlan.cpp
+++ b/lldb/source/Symbol/UnwindPlan.cpp
@@ -87,8 +87,10 @@ static void DumpDWARFExpr(Stream &s, llvm::ArrayRef<uint8_t> expr, Thread *threa
if (auto order_and_width = GetByteOrderAndAddrSize(thread)) {
llvm::DataExtractor data(expr, order_and_width->first == eByteOrderLittle,
order_and_width->second);
- llvm::DWARFExpression(data, order_and_width->second, llvm::dwarf::DWARF32)
- .print(s.AsRawOstream(), llvm::DIDumpOptions(), nullptr);
+ llvm::DWARFExpression E(data, order_and_width->second,
+ llvm::dwarf::DWARF32);
+ llvm::DWARFExpressionPrinter::print(&E, s.AsRawOstream(),
+ llvm::DIDumpOptions(), nullptr);
} else
s.PutCString("dwarf-expr");
}
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 58edf97..61a3d05 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -5838,7 +5838,7 @@ void Process::ClearPreResumeActions() { m_pre_resume_actions.clear(); }
void Process::ClearPreResumeAction(PreResumeActionCallback callback, void *baton)
{
PreResumeCallbackAndBaton element(callback, baton);
- auto found_iter = std::find(m_pre_resume_actions.begin(), m_pre_resume_actions.end(), element);
+ auto found_iter = llvm::find(m_pre_resume_actions, element);
if (found_iter != m_pre_resume_actions.end())
{
m_pre_resume_actions.erase(found_iter);
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index 32ed8e0..880300d0 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -1233,247 +1233,201 @@ bool RegisterContextUnwind::IsTrapHandlerSymbol(
return false;
}
-// Answer the question: Where did THIS frame save the CALLER frame ("previous"
-// frame)'s register value?
-
-enum UnwindLLDB::RegisterSearchResult
-RegisterContextUnwind::SavedLocationForRegister(
- uint32_t lldb_regnum,
- lldb_private::UnwindLLDB::ConcreteRegisterLocation &regloc) {
+// Search this stack frame's UnwindPlans for the AbstractRegisterLocation
+// for this register.
+//
+// \param[in] lldb_regnum
+// The register number (in the eRegisterKindLLDB register numbering)
+// we are searching for.
+//
+// \param[out] kind
+// Set to the RegisterKind of the UnwindPlan which is the basis for
+// the returned AbstractRegisterLocation; if the location is in terms
+// of another register number, this Kind is needed to interpret it
+// correctly.
+//
+// \return
+ // An empty optional indicates that there was an error in processing
+// the request.
+//
+// If there is no unwind rule for a volatile (caller-preserved) register,
+// the returned AbstractRegisterLocation will be IsUndefined,
+// indicating that we should stop searching.
+//
+// If there is no unwind rule for a non-volatile (callee-preserved)
+// register, the returned AbstractRegisterLocation will be IsSame.
+// In frame 0, IsSame means get the value from the live register context.
+ // Otherwise it means to continue down the stack to more-live frames,
+ // looking for a location/value.
+//
+// If an AbstractRegisterLocation is found in an UnwindPlan, that will
+// be returned, with no consideration of the current ABI rules for
+// registers. Functions using an alternate ABI calling convention
+// will work as long as the UnwindPlans are exhaustive about what
+// registers are volatile/non-volatile.
+std::optional<UnwindPlan::Row::AbstractRegisterLocation>
+RegisterContextUnwind::GetAbstractRegisterLocation(uint32_t lldb_regnum,
+ lldb::RegisterKind &kind) {
RegisterNumber regnum(m_thread, eRegisterKindLLDB, lldb_regnum);
Log *log = GetLog(LLDBLog::Unwind);
- // Have we already found this register location?
- if (!m_registers.empty()) {
- std::map<uint32_t,
- lldb_private::UnwindLLDB::ConcreteRegisterLocation>::const_iterator
- iterator;
- iterator = m_registers.find(regnum.GetAsKind(eRegisterKindLLDB));
- if (iterator != m_registers.end()) {
- regloc = iterator->second;
- UnwindLogMsg("supplying caller's saved %s (%d)'s location, cached",
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
- return UnwindLLDB::RegisterSearchResult::eRegisterFound;
- }
- }
-
- // Look through the available UnwindPlans for the register location.
-
+ kind = eRegisterKindLLDB;
UnwindPlan::Row::AbstractRegisterLocation unwindplan_regloc;
- bool have_unwindplan_regloc = false;
- RegisterKind unwindplan_registerkind = kNumRegisterKinds;
+ // First, try to find a register location via the FastUnwindPlan
if (m_fast_unwind_plan_sp) {
const UnwindPlan::Row *active_row =
m_fast_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
- unwindplan_registerkind = m_fast_unwind_plan_sp->GetRegisterKind();
- if (regnum.GetAsKind(unwindplan_registerkind) == LLDB_INVALID_REGNUM) {
+ kind = m_fast_unwind_plan_sp->GetRegisterKind();
+ if (regnum.GetAsKind(kind) == LLDB_INVALID_REGNUM) {
UnwindLogMsg("could not convert lldb regnum %s (%d) into %d RegisterKind "
"reg numbering scheme",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB),
- (int)unwindplan_registerkind);
- return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
+ (int)kind);
+ return {};
}
- // The architecture default unwind plan marks unknown registers as
- // Undefined so that we don't forward them up the stack when a
- // jitted stack frame may have overwritten them. But when the
- // arch default unwind plan is used as the Fast Unwind Plan, we
- // need to recognize this & switch over to the Full Unwind Plan
- // to see what unwind rule that (more knoweldgeable, probably)
- // UnwindPlan has. If the full UnwindPlan says the register
- // location is Undefined, then it really is.
- if (active_row->GetRegisterInfo(regnum.GetAsKind(unwindplan_registerkind),
+ // The Fast UnwindPlan typically only provides fp & pc as we move up
+ // the stack, without requiring additional parsing or memory reads.
+ // It may mark all other registers as IsUndefined(), indicating
+ // that it doesn't know whether they were spilled to the stack or not.
+ // In that case, for an IsUndefined register, we should continue on
+ // to the Full UnwindPlan, which may have more accurate information
+ // about the locations of all registers.
+ if (active_row &&
+ active_row->GetRegisterInfo(regnum.GetAsKind(kind),
unwindplan_regloc) &&
!unwindplan_regloc.IsUndefined()) {
UnwindLogMsg(
"supplying caller's saved %s (%d)'s location using FastUnwindPlan",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
- have_unwindplan_regloc = true;
+ return unwindplan_regloc;
}
}
- if (!have_unwindplan_regloc) {
- // m_full_unwind_plan_sp being NULL means that we haven't tried to find a
- // full UnwindPlan yet
- bool got_new_full_unwindplan = false;
- if (!m_full_unwind_plan_sp) {
- m_full_unwind_plan_sp = GetFullUnwindPlanForFrame();
- got_new_full_unwindplan = true;
+ // Second, try to find a register location via the FullUnwindPlan.
+ bool got_new_full_unwindplan = false;
+ if (!m_full_unwind_plan_sp) {
+ m_full_unwind_plan_sp = GetFullUnwindPlanForFrame();
+ got_new_full_unwindplan = true;
+ }
+ if (m_full_unwind_plan_sp) {
+ RegisterNumber pc_regnum(m_thread, eRegisterKindGeneric,
+ LLDB_REGNUM_GENERIC_PC);
+
+ const UnwindPlan::Row *active_row =
+ m_full_unwind_plan_sp->GetRowForFunctionOffset(
+ m_current_offset_backed_up_one);
+ kind = m_full_unwind_plan_sp->GetRegisterKind();
+
+ if (got_new_full_unwindplan && active_row && log) {
+ StreamString active_row_strm;
+ ExecutionContext exe_ctx(m_thread.shared_from_this());
+ active_row->Dump(active_row_strm, m_full_unwind_plan_sp.get(), &m_thread,
+ m_start_pc.GetLoadAddress(exe_ctx.GetTargetPtr()));
+ UnwindLogMsg("Using full unwind plan '%s'",
+ m_full_unwind_plan_sp->GetSourceName().AsCString());
+ UnwindLogMsg("active row: %s", active_row_strm.GetData());
}
- if (m_full_unwind_plan_sp) {
- RegisterNumber pc_regnum(m_thread, eRegisterKindGeneric,
- LLDB_REGNUM_GENERIC_PC);
+ if (regnum.GetAsKind(kind) == LLDB_INVALID_REGNUM) {
+ if (kind == eRegisterKindGeneric)
+ UnwindLogMsg("could not convert lldb regnum %s (%d) into "
+ "eRegisterKindGeneric reg numbering scheme",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
+ else
+ UnwindLogMsg("could not convert lldb regnum %s (%d) into %d "
+ "RegisterKind reg numbering scheme",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB),
+ (int)kind);
+ return {};
+ }
- const UnwindPlan::Row *active_row =
- m_full_unwind_plan_sp->GetRowForFunctionOffset(
- m_current_offset_backed_up_one);
- unwindplan_registerkind = m_full_unwind_plan_sp->GetRegisterKind();
+ if (regnum.IsValid() && active_row &&
+ active_row->GetRegisterInfo(regnum.GetAsKind(kind),
+ unwindplan_regloc)) {
+ UnwindLogMsg(
+ "supplying caller's saved %s (%d)'s location using %s UnwindPlan",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB),
+ m_full_unwind_plan_sp->GetSourceName().GetCString());
+ return unwindplan_regloc;
+ }
- if (got_new_full_unwindplan && active_row && log) {
- StreamString active_row_strm;
- ExecutionContext exe_ctx(m_thread.shared_from_this());
- active_row->Dump(active_row_strm, m_full_unwind_plan_sp.get(),
- &m_thread,
- m_start_pc.GetLoadAddress(exe_ctx.GetTargetPtr()));
- UnwindLogMsg("Using full unwind plan '%s'",
- m_full_unwind_plan_sp->GetSourceName().AsCString());
- UnwindLogMsg("active row: %s", active_row_strm.GetData());
- }
- RegisterNumber return_address_reg;
-
- // If we're fetching the saved pc and this UnwindPlan defines a
- // ReturnAddress register (e.g. lr on arm), look for the return address
- // register number in the UnwindPlan's row.
- if (pc_regnum.IsValid() && pc_regnum == regnum &&
- m_full_unwind_plan_sp->GetReturnAddressRegister() !=
- LLDB_INVALID_REGNUM) {
- // If this is a trap handler frame, we should have access to
- // the complete register context when the interrupt/async
- // signal was received, we should fetch the actual saved $pc
- // value instead of the Return Address register.
- // If $pc is not available, fall back to the RA reg.
- UnwindPlan::Row::AbstractRegisterLocation scratch;
- if (m_frame_type == eTrapHandlerFrame && active_row &&
- active_row->GetRegisterInfo(
- pc_regnum.GetAsKind(unwindplan_registerkind), scratch)) {
- UnwindLogMsg("Providing pc register instead of rewriting to "
- "RA reg because this is a trap handler and there is "
- "a location for the saved pc register value.");
- } else {
- return_address_reg.init(
- m_thread, m_full_unwind_plan_sp->GetRegisterKind(),
- m_full_unwind_plan_sp->GetReturnAddressRegister());
- regnum = return_address_reg;
- UnwindLogMsg("requested caller's saved PC but this UnwindPlan uses a "
- "RA reg; getting %s (%d) instead",
- return_address_reg.GetName(),
- return_address_reg.GetAsKind(eRegisterKindLLDB));
- }
+ // When asking for the caller's pc and no register location for it was
+ // found above in the UnwindPlan, check if we have a Return Address
+ // register on this target.
+ //
+ // On a Return Address Register architecture like arm/mips/riscv,
+ // the caller's pc is in the RA register, and will be spilled to
+ // stack before any other function is called. If no function
+ // has been called yet, the return address may still be in the
+ // live RA reg.
+ //
+ // There's a lot of variety in what we might see in an UnwindPlan.
+ // We may have
+ // ra=IsSame {unnecessary}
+ // ra=StackAddr {caller's return addr spilled to stack}
+ // or no reg location for pc or ra at all, in a frameless function -
+ // the caller's return address is in the live ra reg.
+ //
+ // If a function has been interrupted in a non-call way --
+ // async signal/sigtramp, or a hardware exception / interrupt / fault --
+ // then the "pc" and "ra" are two distinct values, and must be
+ // handled separately. The "pc" is the pc value at the point
+ // the function was interrupted. The "ra" is the return address
+ // register value at that point.
+ // The UnwindPlan for the sigtramp/trap handler will normally have
+ // register locations for both pc and lr, and so we'll have already
+ // fetched them above.
+ if (pc_regnum.IsValid() && pc_regnum == regnum) {
+ uint32_t return_address_regnum = LLDB_INVALID_REGNUM;
+
+ // Get the return address register number from the UnwindPlan
+ // or the register set definition.
+ if (m_full_unwind_plan_sp->GetReturnAddressRegister() !=
+ LLDB_INVALID_REGNUM) {
+ return_address_regnum =
+ m_full_unwind_plan_sp->GetReturnAddressRegister();
} else {
- if (regnum.GetAsKind(unwindplan_registerkind) == LLDB_INVALID_REGNUM) {
- if (unwindplan_registerkind == eRegisterKindGeneric) {
- UnwindLogMsg("could not convert lldb regnum %s (%d) into "
- "eRegisterKindGeneric reg numbering scheme",
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
- } else {
- UnwindLogMsg("could not convert lldb regnum %s (%d) into %d "
- "RegisterKind reg numbering scheme",
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB),
- (int)unwindplan_registerkind);
- }
- return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
- }
- }
-
- // Check if the active_row has a register location listed.
- if (regnum.IsValid() && active_row &&
- active_row->GetRegisterInfo(regnum.GetAsKind(unwindplan_registerkind),
- unwindplan_regloc)) {
- have_unwindplan_regloc = true;
- UnwindLogMsg(
- "supplying caller's saved %s (%d)'s location using %s UnwindPlan",
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB),
- m_full_unwind_plan_sp->GetSourceName().GetCString());
+ RegisterNumber arch_default_ra_regnum(m_thread, eRegisterKindGeneric,
+ LLDB_REGNUM_GENERIC_RA);
+ return_address_regnum = arch_default_ra_regnum.GetAsKind(kind);
}
- // This is frame 0 and we're retrieving the PC and it's saved in a Return
- // Address register and it hasn't been saved anywhere yet -- that is,
- // it's still live in the actual register. Handle this specially.
- if (!have_unwindplan_regloc && return_address_reg.IsValid() &&
- return_address_reg.GetAsKind(eRegisterKindLLDB) !=
- LLDB_INVALID_REGNUM) {
- if (IsFrameZero()) {
- lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc;
- new_regloc.type = UnwindLLDB::ConcreteRegisterLocation::
- eRegisterInLiveRegisterContext;
- new_regloc.location.register_number =
- return_address_reg.GetAsKind(eRegisterKindLLDB);
- m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc;
- regloc = new_regloc;
- UnwindLogMsg("supplying caller's register %s (%d) from the live "
- "RegisterContext at frame 0, saved in %d",
+ // This system is using a return address register.
+ if (return_address_regnum != LLDB_INVALID_REGNUM) {
+ RegisterNumber return_address_reg;
+ return_address_reg.init(m_thread,
+ m_full_unwind_plan_sp->GetRegisterKind(),
+ return_address_regnum);
+ UnwindLogMsg("requested caller's saved PC but this UnwindPlan uses a "
+ "RA reg; getting %s (%d) instead",
+ return_address_reg.GetName(),
+ return_address_reg.GetAsKind(eRegisterKindLLDB));
+
+ // Do we have a location for the ra register?
+ if (active_row &&
+ active_row->GetRegisterInfo(return_address_reg.GetAsKind(kind),
+ unwindplan_regloc)) {
+ UnwindLogMsg("supplying caller's saved %s (%d)'s location using "
+ "%s UnwindPlan",
return_address_reg.GetName(),
return_address_reg.GetAsKind(eRegisterKindLLDB),
- return_address_reg.GetAsKind(eRegisterKindLLDB));
- return UnwindLLDB::RegisterSearchResult::eRegisterFound;
- } else if (BehavesLikeZerothFrame()) {
- // This function was interrupted asynchronously -- it faulted,
- // an async interrupt, a timer fired, a debugger expression etc.
- // The caller's pc is in the Return Address register, but the
- // UnwindPlan for this function may have no location rule for
- // the RA reg.
- // This means that the caller's return address is in the RA reg
- // when the function was interrupted--descend down one stack frame
- // to retrieve it from the trap handler's saved context.
- unwindplan_regloc.SetSame();
- have_unwindplan_regloc = true;
- }
- }
-
- // If this architecture stores the return address in a register (it
- // defines a Return Address register) and we're on a non-zero stack frame
- // and the Full UnwindPlan says that the pc is stored in the
- // RA registers (e.g. lr on arm), then we know that the full unwindplan is
- // not trustworthy -- this
- // is an impossible situation and the instruction emulation code has
- // likely been misled. If this stack frame meets those criteria, we need
- // to throw away the Full UnwindPlan that the instruction emulation came
- // up with and fall back to the architecture's Default UnwindPlan so the
- // stack walk can get past this point.
-
- // Special note: If the Full UnwindPlan was generated from the compiler,
- // don't second-guess it when we're at a call site location.
-
- // arch_default_ra_regnum is the return address register # in the Full
- // UnwindPlan register numbering
- RegisterNumber arch_default_ra_regnum(m_thread, eRegisterKindGeneric,
- LLDB_REGNUM_GENERIC_RA);
-
- if (arch_default_ra_regnum.GetAsKind(unwindplan_registerkind) !=
- LLDB_INVALID_REGNUM &&
- pc_regnum == regnum && unwindplan_regloc.IsInOtherRegister() &&
- unwindplan_regloc.GetRegisterNumber() ==
- arch_default_ra_regnum.GetAsKind(unwindplan_registerkind) &&
- m_full_unwind_plan_sp->GetSourcedFromCompiler() != eLazyBoolYes &&
- !m_all_registers_available) {
- UnwindLogMsg("%s UnwindPlan tried to restore the pc from the link "
- "register but this is a non-zero frame",
- m_full_unwind_plan_sp->GetSourceName().GetCString());
-
- // Throw away the full unwindplan; install the arch default unwindplan
- if (ForceSwitchToFallbackUnwindPlan()) {
- // Update for the possibly new unwind plan
- unwindplan_registerkind = m_full_unwind_plan_sp->GetRegisterKind();
- const UnwindPlan::Row *active_row =
- m_full_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
-
- // Sanity check: Verify that we can fetch a pc value and CFA value
- // with this unwind plan
-
- RegisterNumber arch_default_pc_reg(m_thread, eRegisterKindGeneric,
- LLDB_REGNUM_GENERIC_PC);
- bool can_fetch_pc_value = false;
- bool can_fetch_cfa = false;
- addr_t cfa_value;
- if (active_row) {
- if (arch_default_pc_reg.GetAsKind(unwindplan_registerkind) !=
- LLDB_INVALID_REGNUM &&
- active_row->GetRegisterInfo(
- arch_default_pc_reg.GetAsKind(unwindplan_registerkind),
- unwindplan_regloc)) {
- can_fetch_pc_value = true;
- }
- if (ReadFrameAddress(unwindplan_registerkind,
- active_row->GetCFAValue(), cfa_value)) {
- can_fetch_cfa = true;
- }
- }
-
- have_unwindplan_regloc = can_fetch_pc_value && can_fetch_cfa;
+ m_full_unwind_plan_sp->GetSourceName().GetCString());
+ // If we have "ra=IsSame", rewrite to "ra=InRegister(ra)" because the
+ // calling function thinks it is fetching "pc" and if we return an
+ // IsSame register location, it will try to read pc.
+ if (unwindplan_regloc.IsSame())
+ unwindplan_regloc.SetInRegister(return_address_reg.GetAsKind(kind));
+ return unwindplan_regloc;
} else {
- // We were unable to fall back to another unwind plan
- have_unwindplan_regloc = false;
+ // No unwind rule for the return address reg on frame 0, or an
+ // interrupted function, means that the caller's address is still in
+ // the RA reg (0th frame) or that the trap handler below this one
+ // (sigtramp etc.) has a save location for the RA reg.
+ if (BehavesLikeZerothFrame()) {
+ unwindplan_regloc.SetInRegister(return_address_reg.GetAsKind(kind));
+ return unwindplan_regloc;
+ }
}
}
}
@@ -1481,55 +1435,86 @@ RegisterContextUnwind::SavedLocationForRegister(
ExecutionContext exe_ctx(m_thread.shared_from_this());
Process *process = exe_ctx.GetProcessPtr();
- if (!have_unwindplan_regloc) {
- // If the UnwindPlan failed to give us an unwind location for this
- // register, we may be able to fall back to some ABI-defined default. For
- // example, some ABIs allow to determine the caller's SP via the CFA. Also,
- // the ABI may set volatile registers to the undefined state.
- ABI *abi = process ? process->GetABI().get() : nullptr;
- if (abi) {
- const RegisterInfo *reg_info =
- GetRegisterInfoAtIndex(regnum.GetAsKind(eRegisterKindLLDB));
- if (reg_info &&
- abi->GetFallbackRegisterLocation(reg_info, unwindplan_regloc)) {
+
+ // Third, try finding a register location via the ABI
+ // FallbackRegisterLocation.
+ //
+ // If the UnwindPlan failed to give us an unwind location for this
+ // register, we may be able to fall back to some ABI-defined default. For
+ // example, some ABIs allow determining the caller's SP via the CFA. Also,
+ // the ABI will set volatile registers to the undefined state.
+ ABI *abi = process ? process->GetABI().get() : nullptr;
+ if (abi) {
+ const RegisterInfo *reg_info =
+ GetRegisterInfoAtIndex(regnum.GetAsKind(eRegisterKindLLDB));
+ if (reg_info &&
+ abi->GetFallbackRegisterLocation(reg_info, unwindplan_regloc)) {
+ if (!unwindplan_regloc.IsUndefined())
UnwindLogMsg(
"supplying caller's saved %s (%d)'s location using ABI default",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
- have_unwindplan_regloc = true;
- }
+ // ABI-defined volatile registers with no register location
+ // will be returned as IsUndefined, stopping the search down
+ // the stack.
+ return unwindplan_regloc;
}
}
- if (!have_unwindplan_regloc) {
- if (IsFrameZero()) {
- // This is frame 0 - we should return the actual live register context
- // value
- lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc;
- new_regloc.type =
- UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext;
- new_regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB);
- m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc;
- regloc = new_regloc;
- UnwindLogMsg("supplying caller's register %s (%d) from the live "
- "RegisterContext at frame 0",
+ // We have no AbstractRegisterLocation, and the ABI says this is a
+ // non-volatile / callee-preserved register. Continue down the stack
+ // or to frame 0 & the live RegisterContext.
+ std::string unwindplan_name;
+ if (m_full_unwind_plan_sp) {
+ unwindplan_name += "via '";
+ unwindplan_name += m_full_unwind_plan_sp->GetSourceName().AsCString();
+ unwindplan_name += "'";
+ }
+ UnwindLogMsg("no save location for %s (%d) %s", regnum.GetName(),
+ regnum.GetAsKind(eRegisterKindLLDB), unwindplan_name.c_str());
+
+ unwindplan_regloc.SetSame();
+ return unwindplan_regloc;
+}
+
+// Answer the question: Where did THIS frame save the CALLER frame ("previous"
+// frame)'s register value?
+
+enum UnwindLLDB::RegisterSearchResult
+RegisterContextUnwind::SavedLocationForRegister(
+ uint32_t lldb_regnum,
+ lldb_private::UnwindLLDB::ConcreteRegisterLocation &regloc) {
+ RegisterNumber regnum(m_thread, eRegisterKindLLDB, lldb_regnum);
+ Log *log = GetLog(LLDBLog::Unwind);
+
+ // Have we already found this register location?
+ if (!m_registers.empty()) {
+ auto iterator = m_registers.find(regnum.GetAsKind(eRegisterKindLLDB));
+ if (iterator != m_registers.end()) {
+ regloc = iterator->second;
+ UnwindLogMsg("supplying caller's saved %s (%d)'s location, cached",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
return UnwindLLDB::RegisterSearchResult::eRegisterFound;
- } else {
- std::string unwindplan_name;
- if (m_full_unwind_plan_sp) {
- unwindplan_name += "via '";
- unwindplan_name += m_full_unwind_plan_sp->GetSourceName().AsCString();
- unwindplan_name += "'";
- }
- UnwindLogMsg("no save location for %s (%d) %s", regnum.GetName(),
- regnum.GetAsKind(eRegisterKindLLDB),
- unwindplan_name.c_str());
}
+ }
+
+ RegisterKind abs_regkind;
+ std::optional<UnwindPlan::Row::AbstractRegisterLocation> abs_regloc =
+ GetAbstractRegisterLocation(lldb_regnum, abs_regkind);
+
+ if (!abs_regloc)
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
+
+ if (abs_regloc->IsUndefined()) {
+ UnwindLogMsg(
+ "did not supply reg location for %s (%d) because it is volatile",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
+ return UnwindLLDB::RegisterSearchResult::eRegisterIsVolatile;
}
- // unwindplan_regloc has valid contents about where to retrieve the register
- if (unwindplan_regloc.IsUnspecified()) {
+ ExecutionContext exe_ctx(m_thread.shared_from_this());
+ Process *process = exe_ctx.GetProcessPtr();
+ // abs_regloc has valid contents about where to retrieve the register
+ if (abs_regloc->IsUnspecified()) {
lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc = {};
new_regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterNotSaved;
m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc;
@@ -1538,15 +1523,23 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
}
- if (unwindplan_regloc.IsUndefined()) {
- UnwindLogMsg(
- "did not supply reg location for %s (%d) because it is volatile",
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
- return UnwindLLDB::RegisterSearchResult::eRegisterIsVolatile;
- }
-
- if (unwindplan_regloc.IsSame()) {
- if (!m_all_registers_available &&
+ if (abs_regloc->IsSame()) {
+ if (IsFrameZero()) {
+ regloc.type =
+ UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext;
+ regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB);
+ m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
+ UnwindLogMsg("supplying caller's register %s (%d) from the live "
+ "RegisterContext at frame 0",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
+ return UnwindLLDB::RegisterSearchResult::eRegisterFound;
+ }
+ // The PC/RA regs don't follow the usual "callee-saved aka non-volatile"
+ // versus "caller-saved aka volatile" system. A stack frame can provide
+ // its caller's return address, but if we don't find a rule for pc/RA
+ // mid-stack, we never want to iterate further down the stack looking
+ // for it. Defensively prevent iterating down the stack for these two.
+ if (!BehavesLikeZerothFrame() &&
(regnum.GetAsKind(eRegisterKindGeneric) == LLDB_REGNUM_GENERIC_PC ||
regnum.GetAsKind(eRegisterKindGeneric) == LLDB_REGNUM_GENERIC_RA)) {
UnwindLogMsg("register %s (%d) is marked as 'IsSame' - it is a pc or "
@@ -1554,20 +1547,19 @@ RegisterContextUnwind::SavedLocationForRegister(
"registers available -- treat as if we have no information",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
- } else {
- regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister;
- regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB);
- m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
- UnwindLogMsg(
- "supplying caller's register %s (%d), saved in register %s (%d)",
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB),
- regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
- return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
+
+ regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterInRegister;
+ regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB);
+ m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
+ UnwindLogMsg(
+ "supplying caller's register %s (%d) value is unmodified in this frame",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
+ return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
- if (unwindplan_regloc.IsCFAPlusOffset()) {
- int offset = unwindplan_regloc.GetOffset();
+ if (abs_regloc->IsCFAPlusOffset()) {
+ int offset = abs_regloc->GetOffset();
regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred;
regloc.location.inferred_value = m_cfa + offset;
m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
@@ -1578,8 +1570,8 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
- if (unwindplan_regloc.IsAtCFAPlusOffset()) {
- int offset = unwindplan_regloc.GetOffset();
+ if (abs_regloc->IsAtCFAPlusOffset()) {
+ int offset = abs_regloc->GetOffset();
regloc.type =
UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation;
regloc.location.target_memory_location = m_cfa + offset;
@@ -1591,11 +1583,11 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
- if (unwindplan_regloc.IsAFAPlusOffset()) {
+ if (abs_regloc->IsAFAPlusOffset()) {
if (m_afa == LLDB_INVALID_ADDRESS)
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
- int offset = unwindplan_regloc.GetOffset();
+ int offset = abs_regloc->GetOffset();
regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred;
regloc.location.inferred_value = m_afa + offset;
m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
@@ -1606,11 +1598,11 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
- if (unwindplan_regloc.IsAtAFAPlusOffset()) {
+ if (abs_regloc->IsAtAFAPlusOffset()) {
if (m_afa == LLDB_INVALID_ADDRESS)
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
- int offset = unwindplan_regloc.GetOffset();
+ int offset = abs_regloc->GetOffset();
regloc.type =
UnwindLLDB::ConcreteRegisterLocation::eRegisterSavedAtMemoryLocation;
regloc.location.target_memory_location = m_afa + offset;
@@ -1622,10 +1614,9 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
- if (unwindplan_regloc.IsInOtherRegister()) {
- uint32_t unwindplan_regnum = unwindplan_regloc.GetRegisterNumber();
- RegisterNumber row_regnum(m_thread, unwindplan_registerkind,
- unwindplan_regnum);
+ if (abs_regloc->IsInOtherRegister()) {
+ RegisterNumber row_regnum(m_thread, abs_regkind,
+ abs_regloc->GetRegisterNumber());
if (row_regnum.GetAsKind(eRegisterKindLLDB) == LLDB_INVALID_REGNUM) {
UnwindLogMsg("could not supply caller's %s (%d) location - was saved in "
"another reg but couldn't convert that regnum",
@@ -1642,16 +1633,14 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterFound;
}
- if (unwindplan_regloc.IsDWARFExpression() ||
- unwindplan_regloc.IsAtDWARFExpression()) {
- DataExtractor dwarfdata(unwindplan_regloc.GetDWARFExpressionBytes(),
- unwindplan_regloc.GetDWARFExpressionLength(),
+ if (abs_regloc->IsDWARFExpression() || abs_regloc->IsAtDWARFExpression()) {
+ DataExtractor dwarfdata(abs_regloc->GetDWARFExpressionBytes(),
+ abs_regloc->GetDWARFExpressionLength(),
process->GetByteOrder(),
process->GetAddressByteSize());
ModuleSP opcode_ctx;
DWARFExpressionList dwarfexpr(opcode_ctx, dwarfdata, nullptr);
- dwarfexpr.GetMutableExpressionAtAddress()->SetRegisterKind(
- unwindplan_registerkind);
+ dwarfexpr.GetMutableExpressionAtAddress()->SetRegisterKind(abs_regkind);
Value cfa_val = Scalar(m_cfa);
cfa_val.SetValueType(Value::ValueType::LoadAddress);
llvm::Expected<Value> result =
@@ -1662,7 +1651,7 @@ RegisterContextUnwind::SavedLocationForRegister(
} else {
addr_t val;
val = result->GetScalar().ULongLong();
- if (unwindplan_regloc.IsDWARFExpression()) {
+ if (abs_regloc->IsDWARFExpression()) {
regloc.type =
UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred;
regloc.location.inferred_value = val;
@@ -1688,9 +1677,9 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
}
- if (unwindplan_regloc.IsConstant()) {
+ if (abs_regloc->IsConstant()) {
regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterValueInferred;
- regloc.location.inferred_value = unwindplan_regloc.GetConstant();
+ regloc.location.inferred_value = abs_regloc->GetConstant();
m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
UnwindLogMsg("supplying caller's register %s (%d) via constant value",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
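Taken together, the refactored lookup proceeds in a fixed order:

  1. The cached location in m_registers (SavedLocationForRegister).
  2. The FastUnwindPlan row, unless its rule is IsUndefined.
  3. The FullUnwindPlan row, rewriting a pc request to the Return Address
     register where the plan or architecture defines one.
  4. The ABI fallback location; IsUndefined here stops the search.
  5. Otherwise IsSame: frame 0 reads the live register context; other frames
     keep walking down the stack (except for pc/RA, which never propagate
     this way).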
diff --git a/lldb/source/Target/RegisterNumber.cpp b/lldb/source/Target/RegisterNumber.cpp
index e5610bf5..56dda8d 100644
--- a/lldb/source/Target/RegisterNumber.cpp
+++ b/lldb/source/Target/RegisterNumber.cpp
@@ -47,6 +47,7 @@ const RegisterNumber &RegisterNumber::operator=(const RegisterNumber &rhs) {
m_reg_ctx_sp = rhs.m_reg_ctx_sp;
m_regnum = rhs.m_regnum;
m_kind = rhs.m_kind;
+ m_kind_regnum_map.clear();
for (auto it : rhs.m_kind_regnum_map)
m_kind_regnum_map[it.first] = it.second;
m_name = rhs.m_name;
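The added clear() fixes a stale-cache bug in operator=: the per-kind regnum
map was merged rather than replaced on assignment. Roughly (hypothetical
register numbers):

  RegisterNumber a(thread, eRegisterKindLLDB, 5);
  (void)a.GetAsKind(eRegisterKindDWARF); // caches a kind->regnum entry for 5
  a = RegisterNumber(thread, eRegisterKindLLDB, 7);
  // Before this fix, the entry cached for regnum 5 survived the assignment
  // and could be returned for a's new value.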
diff --git a/lldb/source/Target/Statistics.cpp b/lldb/source/Target/Statistics.cpp
index 4cfd062..6ec8f89 100644
--- a/lldb/source/Target/Statistics.cpp
+++ b/lldb/source/Target/Statistics.cpp
@@ -10,6 +10,7 @@
#include "lldb/Core/Debugger.h"
#include "lldb/Core/Module.h"
+#include "lldb/Core/PluginManager.h"
#include "lldb/Interpreter/CommandInterpreter.h"
#include "lldb/Symbol/SymbolFile.h"
#include "lldb/Target/DynamicLoader.h"
@@ -294,6 +295,7 @@ llvm::json::Value DebuggerStats::ReportStatistics(
const bool include_targets = options.GetIncludeTargets();
const bool include_modules = options.GetIncludeModules();
const bool include_transcript = options.GetIncludeTranscript();
+ const bool include_plugins = options.GetIncludePlugins();
json::Array json_targets;
json::Array json_modules;
@@ -489,6 +491,10 @@ llvm::json::Value DebuggerStats::ReportStatistics(
}
}
+ if (include_plugins) {
+ global_stats.try_emplace("plugins", PluginManager::GetJSON());
+ }
+
return std::move(global_stats);
}
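
A short sketch, assuming llvm::json from llvm/Support/JSON.h, of conditionally attaching a sub-object to a report the way the include_plugins gate above does; the keys and values here are placeholders:

#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  bool include_plugins = true; // stand-in for options.GetIncludePlugins()
  llvm::json::Object global_stats;
  global_stats.try_emplace("targets", llvm::json::Array{});
  if (include_plugins)
    global_stats.try_emplace(
        "plugins", llvm::json::Object{{"system-runtime", llvm::json::Array{}}});
  llvm::outs() << llvm::json::Value(std::move(global_stats)) << "\n";
  return 0;
}
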
diff --git a/lldb/source/Utility/Listener.cpp b/lldb/source/Utility/Listener.cpp
index 1efaad3..d4ce3bf 100644
--- a/lldb/source/Utility/Listener.cpp
+++ b/lldb/source/Utility/Listener.cpp
@@ -352,8 +352,7 @@ Listener::StartListeningForEventSpec(const BroadcasterManagerSP &manager_sp,
this->shared_from_this(), event_spec);
if (bits_acquired) {
BroadcasterManagerWP manager_wp(manager_sp);
- auto iter = llvm::find_if(m_broadcaster_managers, manager_matcher);
- if (iter == m_broadcaster_managers.end())
+ if (llvm::none_of(m_broadcaster_managers, manager_matcher))
m_broadcaster_managers.push_back(manager_wp);
}
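
The Listener change above swaps the find_if-plus-end comparison for llvm::none_of, which states the intent directly. A small sketch of the same idiom using the std:: equivalents, so it compiles outside the LLVM tree:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> xs = {1, 3, 5};
  auto is_even = [](int x) { return x % 2 == 0; };
  // Before: iterator dance against end().
  bool absent_old = std::find_if(xs.begin(), xs.end(), is_even) == xs.end();
  // After: states the intent directly.
  bool absent_new = std::none_of(xs.begin(), xs.end(), is_even);
  assert(absent_old == absent_new);
  if (absent_new)
    xs.push_back(2); // mirrors the conditional push_back in the patch
}
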
diff --git a/lldb/test/API/commands/settings/TestSettings.py b/lldb/test/API/commands/settings/TestSettings.py
index 4ac1239..bc86494 100644
--- a/lldb/test/API/commands/settings/TestSettings.py
+++ b/lldb/test/API/commands/settings/TestSettings.py
@@ -168,7 +168,6 @@ class SettingsCommandTestCase(TestBase):
substrs=["term-width (unsigned) = 60"],
)
- # rdar://problem/10712130
def test_set_frame_format(self):
"""Test that 'set frame-format' with a backtick char in the format string works as well as fullpath."""
self.build()
diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py
index a9a7e93..83132b4 100644
--- a/lldb/test/API/commands/statistics/basic/TestStats.py
+++ b/lldb/test/API/commands/statistics/basic/TestStats.py
@@ -1068,3 +1068,89 @@ class TestCase(TestBase):
all_targets_stats = self.get_stats("--all-targets")
self.assertIsNotNone(self.find_module_in_metrics(main_exe, all_targets_stats))
self.assertIsNotNone(self.find_module_in_metrics(second_exe, all_targets_stats))
+
+    # Returns some level of the plugin stats hierarchy: either the top-level
+    # node, the namespace node, or a specific plugin node, depending on which
+    # arguments are provided.
+    #
+    # If any of the requested keys are not found in the stats, returns None.
+ #
+ # Plugin stats look like this:
+ #
+ # "plugins": {
+ # "system-runtime": [
+ # {
+ # "enabled": true,
+ # "name": "systemruntime-macosx"
+ # }
+ # ]
+ # },
+ def get_plugin_stats(self, debugger_stats, plugin_namespace=None, plugin_name=None):
+ # Get top level plugin stats.
+ if "plugins" not in debugger_stats:
+ return None
+ plugins = debugger_stats["plugins"]
+ if not plugin_namespace:
+ return plugins
+
+ # Plugin namespace stats.
+ if plugin_namespace not in plugins:
+ return None
+ plugins_for_namespace = plugins[plugin_namespace]
+ if not plugin_name:
+ return plugins_for_namespace
+
+ # Specific plugin stats.
+        for plugin in plugins_for_namespace:
+ if plugin["name"] == plugin_name:
+ return plugin
+ return None
+
+ def test_plugin_stats(self):
+ """
+ Test "statistics dump" contains plugin info.
+ """
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ target = self.createTestTarget(file_path=exe)
+ debugger_stats = self.get_stats()
+
+ # Verify that the statistics dump contains the plugin information.
+ plugins = self.get_plugin_stats(debugger_stats)
+ self.assertIsNotNone(plugins)
+
+ # Check for a known plugin namespace that should be in the stats.
+ system_runtime_plugins = self.get_plugin_stats(debugger_stats, "system-runtime")
+ self.assertIsNotNone(system_runtime_plugins)
+
+        # Validate that the expected keys exist for the bottom-level plugin stats.
+ plugin_keys_exist = [
+ "name",
+ "enabled",
+ ]
+ for plugin in system_runtime_plugins:
+ self.verify_keys(
+ plugin, 'debugger_stats["plugins"]["system-runtime"]', plugin_keys_exist
+ )
+
+ # Check for a known plugin that is enabled by default.
+ system_runtime_macosx_plugin = self.get_plugin_stats(
+ debugger_stats, "system-runtime", "systemruntime-macosx"
+ )
+ self.assertIsNotNone(system_runtime_macosx_plugin)
+ self.assertTrue(system_runtime_macosx_plugin["enabled"])
+
+ # Now disable the plugin and check the stats again.
+ # The stats should show the plugin is disabled.
+ self.runCmd("plugin disable system-runtime.systemruntime-macosx")
+ debugger_stats = self.get_stats()
+ system_runtime_macosx_plugin = self.get_plugin_stats(
+ debugger_stats, "system-runtime", "systemruntime-macosx"
+ )
+ self.assertIsNotNone(system_runtime_macosx_plugin)
+ self.assertFalse(system_runtime_macosx_plugin["enabled"])
+
+ # Plugins should not show up in the stats when disabled with an option.
+ debugger_stats = self.get_stats("--plugins false")
+ plugins = self.get_plugin_stats(debugger_stats)
+ self.assertIsNone(plugins)
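
For reference, a hedged C++ counterpart of the Python get_plugin_stats helper above, walking the same plugins -> namespace -> [{name, enabled}] shape with llvm::json; findPluginStats is illustrative, not an LLDB API:

#include "llvm/Support/JSON.h"

const llvm::json::Object *
findPluginStats(const llvm::json::Object &debugger_stats, llvm::StringRef ns,
                llvm::StringRef name) {
  const llvm::json::Object *plugins = debugger_stats.getObject("plugins");
  if (!plugins)
    return nullptr; // no "plugins" key in the stats dump
  const llvm::json::Array *for_ns = plugins->getArray(ns);
  if (!for_ns)
    return nullptr; // unknown plugin namespace
  for (const llvm::json::Value &v : *for_ns)
    if (const llvm::json::Object *plugin = v.getAsObject())
      if (plugin->getString("name") == name)
        return plugin; // exact plugin match
  return nullptr;
}
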
diff --git a/lldb/test/API/lang/c/forward/TestForwardDeclaration.py b/lldb/test/API/lang/c/forward/TestForwardDeclaration.py
index 5d05f25..b6b6f1d 100644
--- a/lldb/test/API/lang/c/forward/TestForwardDeclaration.py
+++ b/lldb/test/API/lang/c/forward/TestForwardDeclaration.py
@@ -53,7 +53,6 @@ class ForwardDeclarationTestCase(TestBase):
@skipIfDarwin
@skipIf(compiler=no_match("clang"))
@skipIf(compiler_version=["<", "8.0"])
- @expectedFailureAll(oslist=["windows"])
def test_debug_names(self):
"""Test that we are able to find complete types when using DWARF v5
accelerator tables"""
diff --git a/lldb/test/API/lang/cpp/forward/TestCPPForwardDeclaration.py b/lldb/test/API/lang/cpp/forward/TestCPPForwardDeclaration.py
index 5e9dd9c..e66ab4f 100644
--- a/lldb/test/API/lang/cpp/forward/TestCPPForwardDeclaration.py
+++ b/lldb/test/API/lang/cpp/forward/TestCPPForwardDeclaration.py
@@ -47,7 +47,6 @@ class ForwardDeclarationTestCase(TestBase):
@skipIfDarwin
@skipIf(compiler=no_match("clang"))
@skipIf(compiler_version=["<", "8.0"])
- @expectedFailureAll(oslist=["windows"])
def test_debug_names(self):
"""Test that we are able to find complete types when using DWARF v5
accelerator tables"""
diff --git a/lldb/test/Shell/Commands/command-plugin-enable+disable.test b/lldb/test/Shell/Commands/command-plugin-enable+disable.test
new file mode 100644
index 0000000..af646e6
--- /dev/null
+++ b/lldb/test/Shell/Commands/command-plugin-enable+disable.test
@@ -0,0 +1,87 @@
+# This test validates the plugin enable and disable commands.
+# Currently it works only for system-runtime plugins, and we have only one
+# system-runtime plugin, so testing is a bit limited.
+#
+# Note that a command that returns an error stops the rest of the script, so we
+# use separate RUN lines for any command that is expected to return an error.
+
+# RUN: %lldb -s %s -o exit 2>&1 | FileCheck %s
+
+# Test plugin list shows the default state which is expected to be enabled.
+plugin list
+# CHECK-LABEL: plugin list
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin disable disables a plugin.
+plugin disable system-runtime.systemruntime-macosx
+# CHECK-LABEL: plugin disable system-runtime.systemruntime-macosx
+# CHECK: system-runtime
+# CHECK: [-] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Make sure plugin list shows it disabled as well.
+plugin list
+# CHECK: system-runtime
+# CHECK: [-] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin enable re-enables a plugin.
+plugin enable system-runtime.systemruntime-macosx
+# CHECK-LABEL: plugin enable system-runtime.systemruntime-macosx
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Make sure plugin list shows it enabled as well.
+plugin list
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin disable with namespace works.
+plugin disable system-runtime
+# CHECK-LABEL: plugin disable system-runtime
+# CHECK: system-runtime
+# CHECK: [-] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin enable with namespace works.
+plugin enable system-runtime
+# CHECK-LABEL: plugin enable system-runtime
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin enable/disable for instrumentation plugin works.
+plugin enable instrumentation-runtime
+# CHECK-LABEL: plugin enable instrumentation-runtime
+# CHECK: instrumentation-runtime
+# CHECK: [+] AddressSanitizer
+plugin disable instrumentation-runtime
+# CHECK-LABEL: plugin disable instrumentation-runtime
+# CHECK: instrumentation-runtime
+# CHECK: [-] AddressSanitizer
+
+# Test plugin enable with multiple arguments.
+plugin enable system-runtime instrumentation-runtime
+# CHECK-LABEL: plugin enable system-runtime instrumentation-runtime
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries.
+# CHECK: instrumentation-runtime
+# CHECK: [+] AddressSanitizer AddressSanitizer instrumentation runtime plugin.
+
+# Test plugin disable with multiple arguments.
+plugin disable system-runtime instrumentation-runtime
+# CHECK-LABEL: plugin disable system-runtime instrumentation-runtime
+# CHECK: system-runtime
+# CHECK: [-] systemruntime-macosx System runtime plugin for Mac OS X native libraries.
+# CHECK: instrumentation-runtime
+# CHECK: [-] AddressSanitizer AddressSanitizer instrumentation runtime plugin.
+
+# Test plugin enable/disable for unknown plugin returns an error.
+# RUN: %lldb -o "plugin enable some-plugin-that-does-not-exist" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+# RUN: %lldb -o "plugin disable some-plugin-that-does-not-exist" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+# RUN: %lldb -o "plugin enable system-runtime some-plugin-that-does-not-exist" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+# ERROR_PLUGIN_NOT_FOUND: error: Found no matching plugins
+
+# Test plugin enable/disable requires a plugin name.
+# RUN: %lldb -o "plugin enable" 2>&1 | FileCheck %s --check-prefix=ERROR_ARGUMENTS_ENABLE
+# ERROR_ARGUMENTS_ENABLE: error: 'plugin enable' requires one or more arguments
+
+# RUN: %lldb -o "plugin disable" 2>&1 | FileCheck %s --check-prefix=ERROR_ARGUMENTS_DISABLE
+# ERROR_ARGUMENTS_DISABLE: error: 'plugin disable' requires one or more arguments
diff --git a/lldb/test/Shell/Commands/command-plugin-list.test b/lldb/test/Shell/Commands/command-plugin-list.test
new file mode 100644
index 0000000..9d3680d
--- /dev/null
+++ b/lldb/test/Shell/Commands/command-plugin-list.test
@@ -0,0 +1,82 @@
+# This test validates the plugin list command.
+# Currently it works only for system-runtime plugins, and we have only one
+# system-runtime plugin, so testing is a bit limited.
+#
+# Note that a command that returns an error stops the rest of the script, so we
+# use separate RUN lines for any command that is expected to return an error.
+
+# RUN: %lldb -s %s -o exit 2>&1 | FileCheck %s
+
+# Test plugin list without an argument will list all plugins.
+plugin list
+# CHECK-LABEL: plugin list
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+# CHECK: instrumentation-runtime
+# CHECK: [+] AddressSanitizer AddressSanitizer instrumentation runtime plugin.
+
+# Test plugin list works with fully qualified name.
+plugin list system-runtime.systemruntime-macosx
+# CHECK-LABEL: plugin list system-runtime.systemruntime-macosx
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin list on plugin namespace works.
+plugin list system-runtime
+# CHECK-LABEL: plugin list system-runtime
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+
+# Test plugin list on multiple args works.
+plugin list system-runtime instrumentation-runtime.AddressSanitizer
+# CHECK-LABEL: plugin list system-runtime instrumentation-runtime.AddressSanitizer
+# CHECK: system-runtime
+# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries
+# CHECK: instrumentation-runtime
+# CHECK: [+] AddressSanitizer AddressSanitizer instrumentation runtime plugin.
+
+# Test json output for plugin list.
+plugin list --json
+# CHECK-LABEL: plugin list --json
+# CHECK: {
+# CHECK-DAG: "instrumentation-runtime":
+# CHECK-DAG: "system-runtime":
+# CHECK: }
+
+# Test json output for plugin list with a namespace
+plugin list system-runtime --json
+# CHECK-LABEL: plugin list system-runtime --json
+# CHECK: {
+# CHECK: "system-runtime": [
+# CHECK: {
+# CHECK-DAG: "enabled": true
+# CHECK-DAG: "name": "systemruntime-macosx"
+# CHECK: }
+# CHECK: ]
+# CHECK: }
+
+# Test json output for listing multiple plugins
+plugin list --json system-runtime instrumentation-runtime.AddressSanitizer
+# CHECK-LABEL: plugin list --json system-runtime instrumentation-runtime.AddressSanitizer
+# CHECK: {
+# CHECK-DAG: "instrumentation-runtime":
+# CHECK-DAG: "name": "AddressSanitizer"
+# CHECK-DAG: "system-runtime":
+# CHECK: }
+
+
+# Test plugin list does not match a plugin name by substring.
+# RUN: %lldb -o "plugin list macosx" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+
+# Test plugin list does not match a plugin namespace by substring.
+# RUN: %lldb -o "plugin list system-runtime." 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+
+# Test plugin list returns an error for unknown second argument
+# RUN: %lldb -o "plugin list system-runtime foo" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+
+# Test plugin list returns an error for unknown second argument
+# RUN: %lldb -o "plugin list --json system-runtime foo" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+
+# Test plugin list for unknown plugin returns an error.
+# RUN: %lldb -o "plugin list some-plugin-that-does-not-exist" 2>&1 | FileCheck %s --check-prefix=ERROR_PLUGIN_NOT_FOUND
+# ERROR_PLUGIN_NOT_FOUND: error: Found no matching plugins
diff --git a/lldb/test/Shell/ObjectFile/PECOFF/dwarf-clang.yaml b/lldb/test/Shell/ObjectFile/PECOFF/dwarf-clang.yaml
new file mode 100644
index 0000000..1adc615
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/PECOFF/dwarf-clang.yaml
@@ -0,0 +1,151 @@
+# Test that LLDB can read executables with DWARF sections generated by Clang
+
+# RUN: yaml2obj %s -o %t
+# RUN: lldb-test object-file %t | FileCheck %s
+
+# CHECK: Name: .debug_abbrev
+# CHECK-NEXT: Type: dwarf-abbrev
+
+# CHECK: Name: .debug_addr
+# CHECK-NEXT: Type: dwarf-addr
+
+# CHECK: Name: .debug_aranges
+# CHECK-NEXT: Type: dwarf-aranges
+
+# CHECK: Name: .debug_info
+# CHECK-NEXT: Type: dwarf-info
+
+# CHECK: Name: .debug_line
+# CHECK-NEXT: Type: dwarf-line
+
+# CHECK: Name: .debug_line_str
+# CHECK-NEXT: Type: dwarf-line-str
+
+# CHECK: Name: .debug_rnglists
+# CHECK-NEXT: Type: dwarf-rnglists
+
+# CHECK: Name: .debug_str
+# CHECK-NEXT: Type: dwarf-str
+
+# CHECK: Name: .debug_str_offsets
+# CHECK-NEXT: Type: dwarf-str-offsets
+
+--- !COFF
+OptionalHeader:
+ AddressOfEntryPoint: 4956
+ ImageBase: 5368709120
+ SectionAlignment: 4096
+ FileAlignment: 512
+ MajorOperatingSystemVersion: 6
+ MinorOperatingSystemVersion: 0
+ MajorImageVersion: 0
+ MinorImageVersion: 0
+ MajorSubsystemVersion: 6
+ MinorSubsystemVersion: 0
+ Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI
+ DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ]
+ SizeOfStackReserve: 1048576
+ SizeOfStackCommit: 4096
+ SizeOfHeapReserve: 1048576
+ SizeOfHeapCommit: 4096
+header:
+ Machine: IMAGE_FILE_MACHINE_AMD64
+ Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE ]
+sections:
+ - Name: .text
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 4096
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .rdata
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 401408
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .data
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+ VirtualAddress: 479232
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .pdata
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 491520
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .fptable
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+ VirtualAddress: 512000
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: _RDATA
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 516096
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: _guard_c
+ Characteristics: [ ]
+ VirtualAddress: 520192
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: _guard_d
+ Characteristics: [ ]
+ VirtualAddress: 524288
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: memcpy_
+ Characteristics: [ ]
+ VirtualAddress: 528384
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .reloc
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 532480
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_abbrev
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 536576
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_addr
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 540672
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_aranges
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 544768
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_info
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 548864
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_line
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 552960
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_line_str
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 557056
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_rnglists
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 561152
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_str
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 565248
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_str_offsets
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 569344
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+symbols: []
+...
diff --git a/lldb/test/Shell/ObjectFile/PECOFF/dwarf-gcc-mingw.yaml b/lldb/test/Shell/ObjectFile/PECOFF/dwarf-gcc-mingw.yaml
new file mode 100644
index 0000000..7f65637
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/PECOFF/dwarf-gcc-mingw.yaml
@@ -0,0 +1,151 @@
+# Test that LLDB can read executables with DWARF sections generated by GCC on MinGW
+
+# RUN: yaml2obj %s -o %t
+# RUN: lldb-test object-file %t | FileCheck %s
+
+# CHECK: Name: .debug_aranges
+# CHECK-NEXT: Type: dwarf-aranges
+
+# CHECK: Name: .debug_info
+# CHECK-NEXT: Type: dwarf-info
+
+# CHECK: Name: .debug_abbrev
+# CHECK-NEXT: Type: dwarf-abbrev
+
+# CHECK: Name: .debug_line
+# CHECK-NEXT: Type: dwarf-line
+
+# CHECK: Name: .debug_frame
+# CHECK-NEXT: Type: dwarf-frame
+
+# CHECK: Name: .debug_str
+# CHECK-NEXT: Type: dwarf-str
+
+# CHECK: Name: .debug_line_str
+# CHECK-NEXT: Type: dwarf-line-str
+
+# CHECK: Name: .debug_loclists
+# CHECK-NEXT: Type: dwarf-loclists
+
+# CHECK: Name: .debug_rnglists
+# CHECK-NEXT: Type: dwarf-rnglists
+
+--- !COFF
+OptionalHeader:
+ AddressOfEntryPoint: 5136
+ ImageBase: 5368709120
+ SectionAlignment: 4096
+ FileAlignment: 512
+ MajorOperatingSystemVersion: 4
+ MinorOperatingSystemVersion: 0
+ MajorImageVersion: 0
+ MinorImageVersion: 0
+ MajorSubsystemVersion: 5
+ MinorSubsystemVersion: 2
+ Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI
+ DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT ]
+ SizeOfStackReserve: 2097152
+ SizeOfStackCommit: 4096
+ SizeOfHeapReserve: 1048576
+ SizeOfHeapCommit: 4096
+header:
+ Machine: IMAGE_FILE_MACHINE_AMD64
+ Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LINE_NUMS_STRIPPED, IMAGE_FILE_LARGE_ADDRESS_AWARE ]
+sections:
+ - Name: .text
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 4096
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .data
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+ VirtualAddress: 12288
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .rdata
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 16384
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .pdata
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 20480
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .xdata
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 24576
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .bss
+ Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+ VirtualAddress: 28672
+ VirtualSize: 384
+ SectionData: ''
+ - Name: .idata
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 32768
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .tls
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+ VirtualAddress: 36864
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .rsrc
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 40960
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .reloc
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 45056
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_aranges
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 49152
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_info
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 53248
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_abbrev
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 98304
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_line
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 106496
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_frame
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 114688
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_str
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 118784
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_line_str
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 122880
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_loclists
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 131072
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+ - Name: .debug_rnglists
+ Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+ VirtualAddress: 139264
+ VirtualSize: 64
+ SectionData: DEADBEEFBAADF00D
+symbols: []
+...
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index 1936396..ab61137 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -198,3 +198,8 @@ if platform.system() == "Darwin":
config.available_features.add("ld_new-bug")
except:
pass
+
+# Some shell tests dynamically link with python.dll and need to know the
+# location of the Python libraries. Setting PYTHONHOME ensures the tests run
+# with the same version of Python that was used to build lldb.
+config.environment["PYTHONHOME"] = config.python_root_dir
diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in
index 7e03938..5be5359 100644
--- a/lldb/test/Shell/lit.site.cfg.py.in
+++ b/lldb/test/Shell/lit.site.cfg.py.in
@@ -21,6 +21,7 @@ config.enable_remote = not @LLDB_TEST_SHELL_DISABLE_REMOTE@
config.libcxx_libs_dir = "@LIBCXX_LIBRARY_DIR@"
config.target_triple = "@LLVM_TARGET_TRIPLE@"
config.python_executable = "@Python3_EXECUTABLE@"
+config.python_root_dir = "@Python3_ROOT_DIR@"
config.have_zlib = @LLVM_ENABLE_ZLIB@
config.objc_gnustep_dir = "@LLDB_TEST_OBJC_GNUSTEP_DIR@"
config.lldb_enable_lzma = @LLDB_ENABLE_LZMA@
diff --git a/lldb/test/Unit/lit.cfg.py b/lldb/test/Unit/lit.cfg.py
index 8d711f1..681e3b1 100644
--- a/lldb/test/Unit/lit.cfg.py
+++ b/lldb/test/Unit/lit.cfg.py
@@ -33,6 +33,7 @@ llvm_config.with_system_environment(
]
)
llvm_config.with_environment("PATH", os.path.dirname(sys.executable), append_path=True)
+config.environment["PYTHONHOME"] = config.python_root_dir
# Enable sanitizer runtime flags.
if config.llvm_use_sanitizer:
diff --git a/lldb/test/Unit/lit.site.cfg.py.in b/lldb/test/Unit/lit.site.cfg.py.in
index 2748be2..fb94797 100644
--- a/lldb/test/Unit/lit.site.cfg.py.in
+++ b/lldb/test/Unit/lit.site.cfg.py.in
@@ -11,6 +11,7 @@ config.lldb_src_root = "@LLDB_SOURCE_DIR@"
config.target_triple = "@LLVM_TARGET_TRIPLE@"
config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
config.python_executable = "@Python3_EXECUTABLE@"
+config.python_root_dir = "@Python3_ROOT_DIR@"
import lit.llvm
lit.llvm.initialize(lit_config, config)
diff --git a/lldb/unittests/Core/PluginManagerTest.cpp b/lldb/unittests/Core/PluginManagerTest.cpp
index 9b0ce22..c7a56b2 100644
--- a/lldb/unittests/Core/PluginManagerTest.cpp
+++ b/lldb/unittests/Core/PluginManagerTest.cpp
@@ -379,3 +379,93 @@ TEST_F(PluginManagerTest, UnRegisterSystemRuntimePluginChangesOrder) {
ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2),
CreateSystemRuntimePluginB);
}
+
+TEST_F(PluginManagerTest, MatchPluginName) {
+ PluginNamespace Foo{"foo", nullptr, nullptr};
+ RegisteredPluginInfo Bar{"bar", "bar plugin ", true};
+ RegisteredPluginInfo Baz{"baz", "baz plugin ", true};
+
+ // Empty pattern matches everything.
+ ASSERT_TRUE(PluginManager::MatchPluginName("", Foo, Bar));
+
+ // Plugin namespace matches all plugins in that namespace.
+ ASSERT_TRUE(PluginManager::MatchPluginName("foo", Foo, Bar));
+ ASSERT_TRUE(PluginManager::MatchPluginName("foo", Foo, Baz));
+
+ // Fully qualified plugin name matches only that plugin.
+ ASSERT_TRUE(PluginManager::MatchPluginName("foo.bar", Foo, Bar));
+ ASSERT_FALSE(PluginManager::MatchPluginName("foo.baz", Foo, Bar));
+
+ // Prefix match should not match.
+ ASSERT_FALSE(PluginManager::MatchPluginName("f", Foo, Bar));
+ ASSERT_FALSE(PluginManager::MatchPluginName("foo.", Foo, Bar));
+ ASSERT_FALSE(PluginManager::MatchPluginName("foo.ba", Foo, Bar));
+}
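
A sketch that reproduces the matching semantics the test asserts (an empty pattern matches everything, a bare namespace matches all its plugins, and otherwise an exact "namespace.plugin" name is required); matchName is illustrative, not PluginManager's implementation:

#include <cassert>
#include <string>

bool matchName(const std::string &pattern, const std::string &ns,
               const std::string &plugin) {
  if (pattern.empty())
    return true;                        // empty pattern matches everything
  if (pattern == ns)
    return true;                        // bare namespace matches its plugins
  return pattern == ns + "." + plugin;  // otherwise exact qualified name only
}

int main() {
  assert(matchName("", "foo", "bar"));
  assert(matchName("foo", "foo", "bar"));
  assert(matchName("foo.bar", "foo", "bar"));
  assert(!matchName("foo.ba", "foo", "bar")); // no prefix matching
  assert(!matchName("f", "foo", "bar"));
}
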
+
+TEST_F(PluginManagerTest, JsonFormat) {
+ RegisterMockSystemRuntimePlugins();
+
+ // We expect the following JSON output:
+ // {
+ // "system-runtime": [
+ // {
+ // "enabled": true,
+ // "name": "a"
+ // },
+ // {
+ // "enabled": true,
+ // "name": "b"
+ // },
+ // {
+ // "enabled": true,
+ // "name": "c"
+ // }
+ // ]
+ // }
+ llvm::json::Object obj = PluginManager::GetJSON();
+
+ // We should have a "system-runtime" array in the top-level object.
+ llvm::json::Array *maybe_array = obj.getArray("system-runtime");
+ ASSERT_TRUE(maybe_array != nullptr);
+ auto &array = *maybe_array;
+ ASSERT_EQ(array.size(), 3u);
+
+ // Check plugin "a" info.
+ ASSERT_TRUE(array[0].getAsObject() != nullptr);
+ ASSERT_TRUE(array[0].getAsObject()->getString("name") == "a");
+ ASSERT_TRUE(array[0].getAsObject()->getBoolean("enabled") == true);
+
+ // Check plugin "b" info.
+ ASSERT_TRUE(array[1].getAsObject() != nullptr);
+ ASSERT_TRUE(array[1].getAsObject()->getString("name") == "b");
+ ASSERT_TRUE(array[1].getAsObject()->getBoolean("enabled") == true);
+
+ // Check plugin "c" info.
+ ASSERT_TRUE(array[2].getAsObject() != nullptr);
+ ASSERT_TRUE(array[2].getAsObject()->getString("name") == "c");
+ ASSERT_TRUE(array[2].getAsObject()->getBoolean("enabled") == true);
+
+ // Disabling a plugin should be reflected in the JSON output.
+ ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false));
+ array = *PluginManager::GetJSON().getArray("system-runtime");
+ ASSERT_TRUE(array[0].getAsObject()->getBoolean("enabled") == true);
+ ASSERT_TRUE(array[1].getAsObject()->getBoolean("enabled") == false);
+ ASSERT_TRUE(array[2].getAsObject()->getBoolean("enabled") == true);
+
+ // Un-registering a plugin should be reflected in the JSON output.
+ ASSERT_TRUE(PluginManager::UnregisterPlugin(CreateSystemRuntimePluginB));
+ array = *PluginManager::GetJSON().getArray("system-runtime");
+ ASSERT_EQ(array.size(), 2u);
+ ASSERT_TRUE(array[0].getAsObject()->getString("name") == "a");
+ ASSERT_TRUE(array[1].getAsObject()->getString("name") == "c");
+
+ // Filtering the JSON output should only include the matching plugins.
+ array =
+ *PluginManager::GetJSON("system-runtime.c").getArray("system-runtime");
+ ASSERT_EQ(array.size(), 1u);
+ ASSERT_TRUE(array[0].getAsObject()->getString("name") == "c");
+
+ // Empty JSON output is allowed if there are no matching plugins.
+ obj = PluginManager::GetJSON("non-existent-plugin");
+ ASSERT_TRUE(obj.empty());
+}
diff --git a/lldb/unittests/Symbol/PostfixExpressionTest.cpp b/lldb/unittests/Symbol/PostfixExpressionTest.cpp
index d56df47..1e437da 100644
--- a/lldb/unittests/Symbol/PostfixExpressionTest.cpp
+++ b/lldb/unittests/Symbol/PostfixExpressionTest.cpp
@@ -159,8 +159,8 @@ static std::string ParseAndGenerateDWARF(llvm::StringRef expr) {
std::string result;
llvm::raw_string_ostream os(result);
- llvm::DWARFExpression(extractor, addr_size, llvm::dwarf::DWARF32)
- .print(os, llvm::DIDumpOptions(), nullptr);
+ llvm::DWARFExpression E(extractor, addr_size, llvm::dwarf::DWARF32);
+ llvm::DWARFExpressionPrinter::print(&E, os, llvm::DIDumpOptions(), nullptr);
return result;
}
diff --git a/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp b/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp
index efb8f720..d746e04 100644
--- a/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp
+++ b/lldb/unittests/SymbolFile/NativePDB/PdbFPOProgramToDWARFExpressionTests.cpp
@@ -39,8 +39,8 @@ CheckValidProgramTranslation(llvm::StringRef fpo_program,
std::string result;
llvm::raw_string_ostream os(result);
- llvm::DWARFExpression(extractor, /*AddressSize=*/4, llvm::dwarf::DWARF32)
- .print(os, llvm::DIDumpOptions(), nullptr);
+ llvm::DWARFExpression E(extractor, /*AddressSize=*/4, llvm::dwarf::DWARF32);
+ llvm::DWARFExpressionPrinter::print(&E, os, llvm::DIDumpOptions(), nullptr);
// actual check
ASSERT_EQ(expected_dwarf_expression, result);
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0958f6a..78604d0 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4438,7 +4438,8 @@ the type size is smaller than the type's store size.
< vscale x <# elements> x <elementtype> > ; Scalable vector
The number of elements is a constant integer value larger than 0;
-elementtype may be any integer, floating-point or pointer type. Vectors
+elementtype may be any integer, floating-point, or pointer type, or a sized
+target extension type that has the ``CanBeVectorElement`` property. Vectors
of size zero are not allowed. For scalable vectors, the total number of
elements is a constant multiple (called vscale) of the specified number
of elements; vscale is a positive integer that is unknown at compile time
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 897dc76..eea06cf 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -1940,6 +1940,28 @@ template <typename R> bool is_sorted(R &&Range) {
return std::is_sorted(adl_begin(Range), adl_end(Range));
}
+/// Provide wrappers to std::includes that take ranges instead of requiring
+/// begin/end to be passed explicitly.
+/// This function checks whether the sorted range \p R2 is a subsequence of the
+/// sorted range \p R1. The ranges must be sorted in non-descending order.
+template <typename R1, typename R2> bool includes(R1 &&Range1, R2 &&Range2) {
+ assert(is_sorted(Range1) && "Range1 must be sorted in non-descending order");
+ assert(is_sorted(Range2) && "Range2 must be sorted in non-descending order");
+ return std::includes(adl_begin(Range1), adl_end(Range1), adl_begin(Range2),
+ adl_end(Range2));
+}
+
+/// This function checks whether the sorted range \p R2 is a subsequence of the
+/// sorted range \p R1. The ranges must be sorted with respect to the
+/// comparator \p C.
+template <typename R1, typename R2, typename Compare>
+bool includes(R1 &&Range1, R2 &&Range2, Compare &&C) {
+ assert(is_sorted(Range1, C) && "Range1 must be sorted with respect to C");
+ assert(is_sorted(Range2, C) && "Range2 must be sorted with respect to C");
+ return std::includes(adl_begin(Range1), adl_end(Range1), adl_begin(Range2),
+ adl_end(Range2), std::forward<Compare>(C));
+}
+
/// Wrapper function around std::count to count the number of times an element
/// \p Element occurs in the given range \p Range.
template <typename R, typename E> auto count(R &&Range, const E &Element) {
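
A usage sketch for the range-based llvm::includes wrappers added above, assuming this patch is applied:

#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <functional>
#include <vector>

int main() {
  std::vector<int> haystack = {1, 2, 3, 5, 8};
  std::vector<int> needle = {2, 5};
  // Both ranges must already be sorted; the wrapper asserts this.
  assert(llvm::includes(haystack, needle));
  // With a comparator, both ranges must be sorted by that same comparator.
  std::vector<int> hay_desc = {8, 5, 3, 2, 1}, needle_desc = {5, 2};
  assert(llvm::includes(hay_desc, needle_desc, std::greater<int>()));
  return 0;
}
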
diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h
index b952825..d6e33c3 100644
--- a/llvm/include/llvm/ADT/bit.h
+++ b/llvm/include/llvm/ADT/bit.h
@@ -300,11 +300,12 @@ template <typename T> [[nodiscard]] T bit_ceil(T Value) {
return T(1) << llvm::bit_width<T>(Value - 1u);
}
-namespace detail {
-template <typename T, std::size_t SizeOfT> struct PopulationCounter {
- static int count(T Value) {
- // Generic version, forward to 32 bits.
- static_assert(SizeOfT <= 4, "Not implemented!");
+/// Count the number of set bits in a value.
+/// Ex. popcount(0xF000F000) = 8
+/// Returns 0 if the word is zero.
+template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
+[[nodiscard]] inline int popcount(T Value) noexcept {
+ if constexpr (sizeof(T) <= 4) {
#if defined(__GNUC__)
return (int)__builtin_popcount(Value);
#else
@@ -313,11 +314,7 @@ template <typename T, std::size_t SizeOfT> struct PopulationCounter {
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24);
#endif
- }
-};
-
-template <typename T> struct PopulationCounter<T, 8> {
- static int count(T Value) {
+ } else if constexpr (sizeof(T) <= 8) {
#if defined(__GNUC__)
return (int)__builtin_popcountll(Value);
#else
@@ -327,16 +324,9 @@ template <typename T> struct PopulationCounter<T, 8> {
v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
return int((uint64_t)(v * 0x0101010101010101ULL) >> 56);
#endif
+ } else {
+ static_assert(sizeof(T) == 0, "T must be 8 bytes or less");
}
-};
-} // namespace detail
-
-/// Count the number of set bits in a value.
-/// Ex. popcount(0xF000F000) = 8
-/// Returns 0 if the word is zero.
-template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
-[[nodiscard]] inline int popcount(T Value) noexcept {
- return detail::PopulationCounter<T, sizeof(T)>::count(Value);
}
// Forward-declare rotr so that rotl can use it.
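
A standalone sketch of the if-constexpr dispatch that replaces the PopulationCounter trait above, simplified to a portable bit-clearing loop in place of the __builtin_popcount fast paths:

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
int popcount_sketch(T value) noexcept {
  if constexpr (sizeof(T) <= 4) {
    uint32_t v = value;
    int n = 0;
    for (; v; v &= v - 1) // clear the lowest set bit each iteration
      ++n;
    return n;
  } else if constexpr (sizeof(T) <= 8) {
    uint64_t v = value;
    int n = 0;
    for (; v; v &= v - 1)
      ++n;
    return n;
  } else {
    static_assert(sizeof(T) == 0, "T must be 8 bytes or less");
  }
}

int main() {
  assert(popcount_sketch(0xF000F000u) == 8);
  assert(popcount_sketch(uint64_t(0)) == 0);
}
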
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
index 0007971..3a25093 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h
@@ -161,6 +161,12 @@ enum {
/// - Pred(2) - The predicate to test
GIM_CheckImmOperandPredicate,
+  /// Check a leaf predicate on the specified operand of an instruction.
+ /// - InsnID(ULEB128) - Instruction ID
+ /// - OpIdx(ULEB128) - Operand index
+ /// - Pred(2) - The predicate to test
+ GIM_CheckLeafOperandPredicate,
+
/// Check a memory operation has the specified atomic ordering.
/// - InsnID(ULEB128) - Instruction ID
/// - Ordering(ULEB128) - The AtomicOrdering value
@@ -707,6 +713,12 @@ protected:
"Subclasses must override this with a tablegen-erated function");
}
+ virtual bool testMOPredicate_MO(unsigned, const MachineOperand &,
+ const MatcherState &State) const {
+ llvm_unreachable(
+ "Subclasses must override this with a tablegen-erated function");
+ }
+
virtual bool testSimplePredicate(unsigned) const {
llvm_unreachable("Subclass does not implement testSimplePredicate!");
}
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 6c4f036..591cf9c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -410,6 +410,26 @@ bool GIMatchTableExecutor::executeMatchTable(
return false;
break;
}
+ case GIM_CheckLeafOperandPredicate: {
+ uint64_t InsnID = readULEB();
+ uint64_t OpIdx = readULEB();
+ uint16_t Predicate = readU16();
+ DEBUG_WITH_TYPE(TgtExecutor::getName(),
+ dbgs() << CurrentIdx
+ << ": GIM_CheckLeafOperandPredicate(MIs[" << InsnID
+ << "]->getOperand(" << OpIdx
+ << "), Predicate=" << Predicate << ")\n");
+ assert(State.MIs[InsnID] != nullptr && "Used insn before defined");
+ assert(State.MIs[InsnID]->getOperand(OpIdx).isReg() &&
+ "Expected register operand");
+ assert(Predicate > GICXXPred_Invalid && "Expected a valid predicate");
+ MachineOperand &MO = State.MIs[InsnID]->getOperand(OpIdx);
+
+ if (!testMOPredicate_MO(Predicate, MO, State))
+ if (handleReject() == RejectAndGiveUp)
+ return false;
+ break;
+ }
case GIM_CheckIsBuildVectorAllOnes:
case GIM_CheckIsBuildVectorAllZeros: {
uint64_t InsnID = readULEB();
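
The new opcode's InsnID and OpIdx are ULEB128-encoded in the match table; a minimal sketch of the decoding that a readULEB-style helper performs (no overflow or truncation checks):

#include <cassert>
#include <cstdint>

uint64_t decodeULEB128(const uint8_t *&p) {
  uint64_t value = 0;
  unsigned shift = 0;
  uint8_t byte;
  do {
    byte = *p++;
    value |= uint64_t(byte & 0x7f) << shift; // low 7 bits are payload
    shift += 7;
  } while (byte & 0x80); // high bit set means more bytes follow
  return value;
}

int main() {
  const uint8_t buf[] = {0xE5, 0x8E, 0x26}; // DWARF spec example: 624485
  const uint8_t *p = buf;
  assert(decodeULEB128(p) == 624485);
}
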
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 47a1aec..465e4a0 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1493,8 +1493,9 @@ enum NodeType {
VECREDUCE_UMIN,
// PARTIAL_REDUCE_[U|S]MLA(Accumulator, Input1, Input2)
- // The partial reduction nodes sign or zero extend Input1 and Input2 to the
- // element type of Accumulator before multiplying their results.
+ // The partial reduction nodes sign or zero extend Input1 and Input2
+ // (with the extension kind noted below) to the element type of
+ // Accumulator before multiplying their results.
// This result is concatenated to the Accumulator, and this is then reduced,
// using addition, to the result type.
// The output is only expected to either be given to another partial reduction
@@ -1506,8 +1507,9 @@ enum NodeType {
// multiple of the number of elements in the Accumulator / output type.
// Input1 and Input2 must have an element type which is the same as or smaller
// than the element type of the Accumulator and output.
- PARTIAL_REDUCE_SMLA,
- PARTIAL_REDUCE_UMLA,
+ PARTIAL_REDUCE_SMLA, // sext, sext
+ PARTIAL_REDUCE_UMLA, // zext, zext
+ PARTIAL_REDUCE_SUMLA, // sext, zext
// The `llvm.experimental.stackmap` intrinsic.
// Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
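
A scalar sketch of the extension kinds noted above: SMLA sign-extends both inputs, UMLA zero-extends both, and SUMLA sign-extends Input1 while zero-extending Input2 before the multiply that feeds the partial reduction:

#include <cassert>
#include <cstdint>

int32_t mul_sumla(int8_t a, uint8_t b) {
  return int32_t(a) * int32_t(uint32_t(b)); // sext(a) * zext(b)
}

int main() {
  // 0xFF as Input1 is -1 after sext; 0xFF as Input2 is 255 after zext.
  assert(mul_sumla(int8_t(-1), uint8_t(0xFF)) == -255);
}
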
diff --git a/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
index a70acca..727cd9a 100644
--- a/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
+++ b/llvm/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
@@ -59,7 +59,7 @@ class ScoreboardHazardRecognizer : public ScheduleHazardRecognizer {
InstrStage::FuncUnits& operator[](size_t idx) const {
// Depth is expected to be a power-of-2.
- assert(Depth && !(Depth & (Depth - 1)) &&
+ assert(llvm::has_single_bit(Depth) &&
"Scoreboard was not initialized properly!");
return Data[(Head + idx) & (Depth-1)];
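
The manual power-of-two test and the bit helper agree; a quick check using C++20 std::has_single_bit as a portable stand-in for llvm::has_single_bit:

#include <bit>
#include <cassert>

int main() {
  for (unsigned d : {1u, 2u, 64u, 4096u})
    assert(std::has_single_bit(d) == (d && !(d & (d - 1))));
  for (unsigned d : {0u, 3u, 6u, 100u})
    assert(std::has_single_bit(d) == (d && !(d & (d - 1))));
}
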
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9c453f5..04bc0e9 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1661,7 +1661,8 @@ public:
/// target has a custom expander for it.
LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT,
EVT InputVT) const {
- assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA);
+ assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA ||
+ Opc == ISD::PARTIAL_REDUCE_SUMLA);
PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy,
InputVT.getSimpleVT().SimpleTy};
auto It = PartialReduceMLAActions.find(Key);
@@ -2759,7 +2760,8 @@ protected:
/// sequence, or the target has a custom expander for it.
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT,
LegalizeAction Action) {
- assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA);
+ assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA ||
+ Opc == ISD::PARTIAL_REDUCE_SUMLA);
assert(AccVT.isValid() && InputVT.isValid() &&
"setPartialReduceMLAAction types aren't valid");
PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy};
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 1e96323..4551e7e 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -155,194 +155,195 @@ def v256i32 : VTVec<256, i32, 75>; // 256 x i32 vector value
def v512i32 : VTVec<512, i32, 76>; // 512 x i32 vector value
def v1024i32 : VTVec<1024, i32, 77>; // 1024 x i32 vector value
def v2048i32 : VTVec<2048, i32, 78>; // 2048 x i32 vector value
-
-def v1i64 : VTVec<1, i64, 79>; // 1 x i64 vector value
-def v2i64 : VTVec<2, i64, 80>; // 2 x i64 vector value
-def v3i64 : VTVec<3, i64, 81>; // 3 x i64 vector value
-def v4i64 : VTVec<4, i64, 82>; // 4 x i64 vector value
-def v8i64 : VTVec<8, i64, 83>; // 8 x i64 vector value
-def v16i64 : VTVec<16, i64, 84>; // 16 x i64 vector value
-def v32i64 : VTVec<32, i64, 85>; // 32 x i64 vector value
-def v64i64 : VTVec<64, i64, 86>; // 64 x i64 vector value
-def v128i64 : VTVec<128, i64, 87>; // 128 x i64 vector value
-def v256i64 : VTVec<256, i64, 88>; // 256 x i64 vector value
-
-def v1i128 : VTVec<1, i128, 89>; // 1 x i128 vector value
-
-def v1f16 : VTVec<1, f16, 90>; // 1 x f16 vector value
-def v2f16 : VTVec<2, f16, 91>; // 2 x f16 vector value
-def v3f16 : VTVec<3, f16, 92>; // 3 x f16 vector value
-def v4f16 : VTVec<4, f16, 93>; // 4 x f16 vector value
-def v8f16 : VTVec<8, f16, 94>; // 8 x f16 vector value
-def v16f16 : VTVec<16, f16, 95>; // 16 x f16 vector value
-def v32f16 : VTVec<32, f16, 96>; // 32 x f16 vector value
-def v64f16 : VTVec<64, f16, 97>; // 64 x f16 vector value
-def v128f16 : VTVec<128, f16, 98>; // 128 x f16 vector value
-def v256f16 : VTVec<256, f16, 99>; // 256 x f16 vector value
-def v512f16 : VTVec<512, f16, 100>; // 512 x f16 vector value
-def v4096f16 : VTVec<4096, f16, 101>; // 4096 x f16 vector value
-
-def v1bf16 : VTVec<1, bf16, 102>; // 1 x bf16 vector value
-def v2bf16 : VTVec<2, bf16, 103>; // 2 x bf16 vector value
-def v3bf16 : VTVec<3, bf16, 104>; // 3 x bf16 vector value
-def v4bf16 : VTVec<4, bf16, 105>; // 4 x bf16 vector value
-def v8bf16 : VTVec<8, bf16, 106>; // 8 x bf16 vector value
-def v16bf16 : VTVec<16, bf16, 107>; // 16 x bf16 vector value
-def v32bf16 : VTVec<32, bf16, 108>; // 32 x bf16 vector value
-def v64bf16 : VTVec<64, bf16, 109>; // 64 x bf16 vector value
-def v128bf16 : VTVec<128, bf16, 110>; // 128 x bf16 vector value
-def v4096bf16 : VTVec<4096, bf16, 111>; // 4096 x bf16 vector value
-
-def v1f32 : VTVec<1, f32, 112>; // 1 x f32 vector value
-def v2f32 : VTVec<2, f32, 113>; // 2 x f32 vector value
-def v3f32 : VTVec<3, f32, 114>; // 3 x f32 vector value
-def v4f32 : VTVec<4, f32, 115>; // 4 x f32 vector value
-def v5f32 : VTVec<5, f32, 116>; // 5 x f32 vector value
-def v6f32 : VTVec<6, f32, 117>; // 6 x f32 vector value
-def v7f32 : VTVec<7, f32, 118>; // 7 x f32 vector value
-def v8f32 : VTVec<8, f32, 119>; // 8 x f32 vector value
-def v9f32 : VTVec<9, f32, 120>; // 9 x f32 vector value
-def v10f32 : VTVec<10, f32, 121>; // 10 x f32 vector value
-def v11f32 : VTVec<11, f32, 122>; // 11 x f32 vector value
-def v12f32 : VTVec<12, f32, 123>; // 12 x f32 vector value
-def v16f32 : VTVec<16, f32, 124>; // 16 x f32 vector value
-def v32f32 : VTVec<32, f32, 125>; // 32 x f32 vector value
-def v64f32 : VTVec<64, f32, 126>; // 64 x f32 vector value
-def v128f32 : VTVec<128, f32, 127>; // 128 x f32 vector value
-def v256f32 : VTVec<256, f32, 128>; // 256 x f32 vector value
-def v512f32 : VTVec<512, f32, 129>; // 512 x f32 vector value
-def v1024f32 : VTVec<1024, f32, 130>; // 1024 x f32 vector value
-def v2048f32 : VTVec<2048, f32, 131>; // 2048 x f32 vector value
-
-def v1f64 : VTVec<1, f64, 132>; // 1 x f64 vector value
-def v2f64 : VTVec<2, f64, 133>; // 2 x f64 vector value
-def v3f64 : VTVec<3, f64, 134>; // 3 x f64 vector value
-def v4f64 : VTVec<4, f64, 135>; // 4 x f64 vector value
-def v8f64 : VTVec<8, f64, 136>; // 8 x f64 vector value
-def v16f64 : VTVec<16, f64, 137>; // 16 x f64 vector value
-def v32f64 : VTVec<32, f64, 138>; // 32 x f64 vector value
-def v64f64 : VTVec<64, f64, 139>; // 64 x f64 vector value
-def v128f64 : VTVec<128, f64, 140>; // 128 x f64 vector value
-def v256f64 : VTVec<256, f64, 141>; // 256 x f64 vector value
-
-def nxv1i1 : VTScalableVec<1, i1, 142>; // n x 1 x i1 vector value
-def nxv2i1 : VTScalableVec<2, i1, 143>; // n x 2 x i1 vector value
-def nxv4i1 : VTScalableVec<4, i1, 144>; // n x 4 x i1 vector value
-def nxv8i1 : VTScalableVec<8, i1, 145>; // n x 8 x i1 vector value
-def nxv16i1 : VTScalableVec<16, i1, 146>; // n x 16 x i1 vector value
-def nxv32i1 : VTScalableVec<32, i1, 147>; // n x 32 x i1 vector value
-def nxv64i1 : VTScalableVec<64, i1, 148>; // n x 64 x i1 vector value
-
-def nxv1i8 : VTScalableVec<1, i8, 149>; // n x 1 x i8 vector value
-def nxv2i8 : VTScalableVec<2, i8, 150>; // n x 2 x i8 vector value
-def nxv4i8 : VTScalableVec<4, i8, 151>; // n x 4 x i8 vector value
-def nxv8i8 : VTScalableVec<8, i8, 152>; // n x 8 x i8 vector value
-def nxv16i8 : VTScalableVec<16, i8, 153>; // n x 16 x i8 vector value
-def nxv32i8 : VTScalableVec<32, i8, 154>; // n x 32 x i8 vector value
-def nxv64i8 : VTScalableVec<64, i8, 155>; // n x 64 x i8 vector value
-
-def nxv1i16 : VTScalableVec<1, i16, 156>; // n x 1 x i16 vector value
-def nxv2i16 : VTScalableVec<2, i16, 157>; // n x 2 x i16 vector value
-def nxv4i16 : VTScalableVec<4, i16, 158>; // n x 4 x i16 vector value
-def nxv8i16 : VTScalableVec<8, i16, 159>; // n x 8 x i16 vector value
-def nxv16i16 : VTScalableVec<16, i16, 160>; // n x 16 x i16 vector value
-def nxv32i16 : VTScalableVec<32, i16, 161>; // n x 32 x i16 vector value
-
-def nxv1i32 : VTScalableVec<1, i32, 162>; // n x 1 x i32 vector value
-def nxv2i32 : VTScalableVec<2, i32, 163>; // n x 2 x i32 vector value
-def nxv4i32 : VTScalableVec<4, i32, 164>; // n x 4 x i32 vector value
-def nxv8i32 : VTScalableVec<8, i32, 165>; // n x 8 x i32 vector value
-def nxv16i32 : VTScalableVec<16, i32, 166>; // n x 16 x i32 vector value
-def nxv32i32 : VTScalableVec<32, i32, 167>; // n x 32 x i32 vector value
-
-def nxv1i64 : VTScalableVec<1, i64, 168>; // n x 1 x i64 vector value
-def nxv2i64 : VTScalableVec<2, i64, 169>; // n x 2 x i64 vector value
-def nxv4i64 : VTScalableVec<4, i64, 170>; // n x 4 x i64 vector value
-def nxv8i64 : VTScalableVec<8, i64, 171>; // n x 8 x i64 vector value
-def nxv16i64 : VTScalableVec<16, i64, 172>; // n x 16 x i64 vector value
-def nxv32i64 : VTScalableVec<32, i64, 173>; // n x 32 x i64 vector value
-
-def nxv1f16 : VTScalableVec<1, f16, 174>; // n x 1 x f16 vector value
-def nxv2f16 : VTScalableVec<2, f16, 175>; // n x 2 x f16 vector value
-def nxv4f16 : VTScalableVec<4, f16, 176>; // n x 4 x f16 vector value
-def nxv8f16 : VTScalableVec<8, f16, 177>; // n x 8 x f16 vector value
-def nxv16f16 : VTScalableVec<16, f16, 178>; // n x 16 x f16 vector value
-def nxv32f16 : VTScalableVec<32, f16, 179>; // n x 32 x f16 vector value
-
-def nxv1bf16 : VTScalableVec<1, bf16, 180>; // n x 1 x bf16 vector value
-def nxv2bf16 : VTScalableVec<2, bf16, 181>; // n x 2 x bf16 vector value
-def nxv4bf16 : VTScalableVec<4, bf16, 182>; // n x 4 x bf16 vector value
-def nxv8bf16 : VTScalableVec<8, bf16, 183>; // n x 8 x bf16 vector value
-def nxv16bf16 : VTScalableVec<16, bf16, 184>; // n x 16 x bf16 vector value
-def nxv32bf16 : VTScalableVec<32, bf16, 185>; // n x 32 x bf16 vector value
-
-def nxv1f32 : VTScalableVec<1, f32, 186>; // n x 1 x f32 vector value
-def nxv2f32 : VTScalableVec<2, f32, 187>; // n x 2 x f32 vector value
-def nxv4f32 : VTScalableVec<4, f32, 188>; // n x 4 x f32 vector value
-def nxv8f32 : VTScalableVec<8, f32, 189>; // n x 8 x f32 vector value
-def nxv16f32 : VTScalableVec<16, f32, 190>; // n x 16 x f32 vector value
-
-def nxv1f64 : VTScalableVec<1, f64, 191>; // n x 1 x f64 vector value
-def nxv2f64 : VTScalableVec<2, f64, 192>; // n x 2 x f64 vector value
-def nxv4f64 : VTScalableVec<4, f64, 193>; // n x 4 x f64 vector value
-def nxv8f64 : VTScalableVec<8, f64, 194>; // n x 8 x f64 vector value
+def v4096i32 : VTVec<4096, i32, 79>; // 4096 x i32 vector value
+
+def v1i64 : VTVec<1, i64, 80>; // 1 x i64 vector value
+def v2i64 : VTVec<2, i64, 81>; // 2 x i64 vector value
+def v3i64 : VTVec<3, i64, 82>; // 3 x i64 vector value
+def v4i64 : VTVec<4, i64, 83>; // 4 x i64 vector value
+def v8i64 : VTVec<8, i64, 84>; // 8 x i64 vector value
+def v16i64 : VTVec<16, i64, 85>; // 16 x i64 vector value
+def v32i64 : VTVec<32, i64, 86>; // 32 x i64 vector value
+def v64i64 : VTVec<64, i64, 87>; // 64 x i64 vector value
+def v128i64 : VTVec<128, i64, 88>; // 128 x i64 vector value
+def v256i64 : VTVec<256, i64, 89>; // 256 x i64 vector value
+
+def v1i128 : VTVec<1, i128, 90>; // 1 x i128 vector value
+
+def v1f16 : VTVec<1, f16, 91>; // 1 x f16 vector value
+def v2f16 : VTVec<2, f16, 92>; // 2 x f16 vector value
+def v3f16 : VTVec<3, f16, 93>; // 3 x f16 vector value
+def v4f16 : VTVec<4, f16, 94>; // 4 x f16 vector value
+def v8f16 : VTVec<8, f16, 95>; // 8 x f16 vector value
+def v16f16 : VTVec<16, f16, 96>; // 16 x f16 vector value
+def v32f16 : VTVec<32, f16, 97>; // 32 x f16 vector value
+def v64f16 : VTVec<64, f16, 98>; // 64 x f16 vector value
+def v128f16 : VTVec<128, f16, 99>; // 128 x f16 vector value
+def v256f16 : VTVec<256, f16, 100>; // 256 x f16 vector value
+def v512f16 : VTVec<512, f16, 101>; // 512 x f16 vector value
+def v4096f16 : VTVec<4096, f16, 102>; // 4096 x f16 vector value
+
+def v1bf16 : VTVec<1, bf16, 103>; // 1 x bf16 vector value
+def v2bf16 : VTVec<2, bf16, 104>; // 2 x bf16 vector value
+def v3bf16 : VTVec<3, bf16, 105>; // 3 x bf16 vector value
+def v4bf16 : VTVec<4, bf16, 106>; // 4 x bf16 vector value
+def v8bf16 : VTVec<8, bf16, 107>; // 8 x bf16 vector value
+def v16bf16 : VTVec<16, bf16, 108>; // 16 x bf16 vector value
+def v32bf16 : VTVec<32, bf16, 109>; // 32 x bf16 vector value
+def v64bf16 : VTVec<64, bf16, 110>; // 64 x bf16 vector value
+def v128bf16 : VTVec<128, bf16, 111>; // 128 x bf16 vector value
+def v4096bf16 : VTVec<4096, bf16, 112>; // 4096 x bf16 vector value
+
+def v1f32 : VTVec<1, f32, 113>; // 1 x f32 vector value
+def v2f32 : VTVec<2, f32, 114>; // 2 x f32 vector value
+def v3f32 : VTVec<3, f32, 115>; // 3 x f32 vector value
+def v4f32 : VTVec<4, f32, 116>; // 4 x f32 vector value
+def v5f32 : VTVec<5, f32, 117>; // 5 x f32 vector value
+def v6f32 : VTVec<6, f32, 118>; // 6 x f32 vector value
+def v7f32 : VTVec<7, f32, 119>; // 7 x f32 vector value
+def v8f32 : VTVec<8, f32, 120>; // 8 x f32 vector value
+def v9f32 : VTVec<9, f32, 121>; // 9 x f32 vector value
+def v10f32 : VTVec<10, f32, 122>; // 10 x f32 vector value
+def v11f32 : VTVec<11, f32, 123>; // 11 x f32 vector value
+def v12f32 : VTVec<12, f32, 124>; // 12 x f32 vector value
+def v16f32 : VTVec<16, f32, 125>; // 16 x f32 vector value
+def v32f32 : VTVec<32, f32, 126>; // 32 x f32 vector value
+def v64f32 : VTVec<64, f32, 127>; // 64 x f32 vector value
+def v128f32 : VTVec<128, f32, 128>; // 128 x f32 vector value
+def v256f32 : VTVec<256, f32, 129>; // 256 x f32 vector value
+def v512f32 : VTVec<512, f32, 130>; // 512 x f32 vector value
+def v1024f32 : VTVec<1024, f32, 131>; // 1024 x f32 vector value
+def v2048f32 : VTVec<2048, f32, 132>; // 2048 x f32 vector value
+
+def v1f64 : VTVec<1, f64, 133>; // 1 x f64 vector value
+def v2f64 : VTVec<2, f64, 134>; // 2 x f64 vector value
+def v3f64 : VTVec<3, f64, 135>; // 3 x f64 vector value
+def v4f64 : VTVec<4, f64, 136>; // 4 x f64 vector value
+def v8f64 : VTVec<8, f64, 137>; // 8 x f64 vector value
+def v16f64 : VTVec<16, f64, 138>; // 16 x f64 vector value
+def v32f64 : VTVec<32, f64, 139>; // 32 x f64 vector value
+def v64f64 : VTVec<64, f64, 140>; // 64 x f64 vector value
+def v128f64 : VTVec<128, f64, 141>; // 128 x f64 vector value
+def v256f64 : VTVec<256, f64, 142>; // 256 x f64 vector value
+
+def nxv1i1 : VTScalableVec<1, i1, 143>; // n x 1 x i1 vector value
+def nxv2i1 : VTScalableVec<2, i1, 144>; // n x 2 x i1 vector value
+def nxv4i1 : VTScalableVec<4, i1, 145>; // n x 4 x i1 vector value
+def nxv8i1 : VTScalableVec<8, i1, 146>; // n x 8 x i1 vector value
+def nxv16i1 : VTScalableVec<16, i1, 147>; // n x 16 x i1 vector value
+def nxv32i1 : VTScalableVec<32, i1, 148>; // n x 32 x i1 vector value
+def nxv64i1 : VTScalableVec<64, i1, 149>; // n x 64 x i1 vector value
+
+def nxv1i8 : VTScalableVec<1, i8, 150>; // n x 1 x i8 vector value
+def nxv2i8 : VTScalableVec<2, i8, 151>; // n x 2 x i8 vector value
+def nxv4i8 : VTScalableVec<4, i8, 152>; // n x 4 x i8 vector value
+def nxv8i8 : VTScalableVec<8, i8, 153>; // n x 8 x i8 vector value
+def nxv16i8 : VTScalableVec<16, i8, 154>; // n x 16 x i8 vector value
+def nxv32i8 : VTScalableVec<32, i8, 155>; // n x 32 x i8 vector value
+def nxv64i8 : VTScalableVec<64, i8, 156>; // n x 64 x i8 vector value
+
+def nxv1i16 : VTScalableVec<1, i16, 157>; // n x 1 x i16 vector value
+def nxv2i16 : VTScalableVec<2, i16, 158>; // n x 2 x i16 vector value
+def nxv4i16 : VTScalableVec<4, i16, 159>; // n x 4 x i16 vector value
+def nxv8i16 : VTScalableVec<8, i16, 160>; // n x 8 x i16 vector value
+def nxv16i16 : VTScalableVec<16, i16, 161>; // n x 16 x i16 vector value
+def nxv32i16 : VTScalableVec<32, i16, 162>; // n x 32 x i16 vector value
+
+def nxv1i32 : VTScalableVec<1, i32, 163>; // n x 1 x i32 vector value
+def nxv2i32 : VTScalableVec<2, i32, 164>; // n x 2 x i32 vector value
+def nxv4i32 : VTScalableVec<4, i32, 165>; // n x 4 x i32 vector value
+def nxv8i32 : VTScalableVec<8, i32, 166>; // n x 8 x i32 vector value
+def nxv16i32 : VTScalableVec<16, i32, 167>; // n x 16 x i32 vector value
+def nxv32i32 : VTScalableVec<32, i32, 168>; // n x 32 x i32 vector value
+
+def nxv1i64 : VTScalableVec<1, i64, 169>; // n x 1 x i64 vector value
+def nxv2i64 : VTScalableVec<2, i64, 170>; // n x 2 x i64 vector value
+def nxv4i64 : VTScalableVec<4, i64, 171>; // n x 4 x i64 vector value
+def nxv8i64 : VTScalableVec<8, i64, 172>; // n x 8 x i64 vector value
+def nxv16i64 : VTScalableVec<16, i64, 173>; // n x 16 x i64 vector value
+def nxv32i64 : VTScalableVec<32, i64, 174>; // n x 32 x i64 vector value
+
+def nxv1f16 : VTScalableVec<1, f16, 175>; // n x 1 x f16 vector value
+def nxv2f16 : VTScalableVec<2, f16, 176>; // n x 2 x f16 vector value
+def nxv4f16 : VTScalableVec<4, f16, 177>; // n x 4 x f16 vector value
+def nxv8f16 : VTScalableVec<8, f16, 178>; // n x 8 x f16 vector value
+def nxv16f16 : VTScalableVec<16, f16, 179>; // n x 16 x f16 vector value
+def nxv32f16 : VTScalableVec<32, f16, 180>; // n x 32 x f16 vector value
+
+def nxv1bf16 : VTScalableVec<1, bf16, 181>; // n x 1 x bf16 vector value
+def nxv2bf16 : VTScalableVec<2, bf16, 182>; // n x 2 x bf16 vector value
+def nxv4bf16 : VTScalableVec<4, bf16, 183>; // n x 4 x bf16 vector value
+def nxv8bf16 : VTScalableVec<8, bf16, 184>; // n x 8 x bf16 vector value
+def nxv16bf16 : VTScalableVec<16, bf16, 185>; // n x 16 x bf16 vector value
+def nxv32bf16 : VTScalableVec<32, bf16, 186>; // n x 32 x bf16 vector value
+
+def nxv1f32 : VTScalableVec<1, f32, 187>; // n x 1 x f32 vector value
+def nxv2f32 : VTScalableVec<2, f32, 188>; // n x 2 x f32 vector value
+def nxv4f32 : VTScalableVec<4, f32, 189>; // n x 4 x f32 vector value
+def nxv8f32 : VTScalableVec<8, f32, 190>; // n x 8 x f32 vector value
+def nxv16f32 : VTScalableVec<16, f32, 191>; // n x 16 x f32 vector value
+
+def nxv1f64 : VTScalableVec<1, f64, 192>; // n x 1 x f64 vector value
+def nxv2f64 : VTScalableVec<2, f64, 193>; // n x 2 x f64 vector value
+def nxv4f64 : VTScalableVec<4, f64, 194>; // n x 4 x f64 vector value
+def nxv8f64 : VTScalableVec<8, f64, 195>; // n x 8 x f64 vector value
// Sz = NF * MinNumElts * 8(bits)
-def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 195>; // RISCV vector tuple(min_num_elts=1, nf=2)
-def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 196>; // RISCV vector tuple(min_num_elts=1, nf=3)
-def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 197>; // RISCV vector tuple(min_num_elts=1, nf=4)
-def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 198>; // RISCV vector tuple(min_num_elts=1, nf=5)
-def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 199>; // RISCV vector tuple(min_num_elts=1, nf=6)
-def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 200>; // RISCV vector tuple(min_num_elts=1, nf=7)
-def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 201>; // RISCV vector tuple(min_num_elts=1, nf=8)
-def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 202>; // RISCV vector tuple(min_num_elts=2, nf=2)
-def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 203>; // RISCV vector tuple(min_num_elts=2, nf=3)
-def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 204>; // RISCV vector tuple(min_num_elts=2, nf=4)
-def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 205>; // RISCV vector tuple(min_num_elts=2, nf=5)
-def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 206>; // RISCV vector tuple(min_num_elts=2, nf=6)
-def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 207>; // RISCV vector tuple(min_num_elts=2, nf=7)
-def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 208>; // RISCV vector tuple(min_num_elts=2, nf=8)
-def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 209>; // RISCV vector tuple(min_num_elts=4, nf=2)
-def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 210>; // RISCV vector tuple(min_num_elts=4, nf=3)
-def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 211>; // RISCV vector tuple(min_num_elts=4, nf=4)
-def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 212>; // RISCV vector tuple(min_num_elts=4, nf=5)
-def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 213>; // RISCV vector tuple(min_num_elts=4, nf=6)
-def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 214>; // RISCV vector tuple(min_num_elts=4, nf=7)
-def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 215>; // RISCV vector tuple(min_num_elts=4, nf=8)
-def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 216>; // RISCV vector tuple(min_num_elts=8, nf=2)
-def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 217>; // RISCV vector tuple(min_num_elts=8, nf=3)
-def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 218>; // RISCV vector tuple(min_num_elts=8, nf=4)
-def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 219>; // RISCV vector tuple(min_num_elts=8, nf=5)
-def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 220>; // RISCV vector tuple(min_num_elts=8, nf=6)
-def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 221>; // RISCV vector tuple(min_num_elts=8, nf=7)
-def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 222>; // RISCV vector tuple(min_num_elts=8, nf=8)
-def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 223>; // RISCV vector tuple(min_num_elts=16, nf=2)
-def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 224>; // RISCV vector tuple(min_num_elts=16, nf=3)
-def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 225>; // RISCV vector tuple(min_num_elts=16, nf=4)
-def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 226>; // RISCV vector tuple(min_num_elts=32, nf=2)
-
-def x86mmx : ValueType<64, 227>; // X86 MMX value
-def Glue : ValueType<0, 228>; // Pre-RA sched glue
-def isVoid : ValueType<0, 229>; // Produces no value
-def untyped : ValueType<8, 230> { // Produces an untyped value
+def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 196>; // RISCV vector tuple(min_num_elts=1, nf=2)
+def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 197>; // RISCV vector tuple(min_num_elts=1, nf=3)
+def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 198>; // RISCV vector tuple(min_num_elts=1, nf=4)
+def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 199>; // RISCV vector tuple(min_num_elts=1, nf=5)
+def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 200>; // RISCV vector tuple(min_num_elts=1, nf=6)
+def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 201>; // RISCV vector tuple(min_num_elts=1, nf=7)
+def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 202>; // RISCV vector tuple(min_num_elts=1, nf=8)
+def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 203>; // RISCV vector tuple(min_num_elts=2, nf=2)
+def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 204>; // RISCV vector tuple(min_num_elts=2, nf=3)
+def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 205>; // RISCV vector tuple(min_num_elts=2, nf=4)
+def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 206>; // RISCV vector tuple(min_num_elts=2, nf=5)
+def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 207>; // RISCV vector tuple(min_num_elts=2, nf=6)
+def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 208>; // RISCV vector tuple(min_num_elts=2, nf=7)
+def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 209>; // RISCV vector tuple(min_num_elts=2, nf=8)
+def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 210>; // RISCV vector tuple(min_num_elts=4, nf=2)
+def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 211>; // RISCV vector tuple(min_num_elts=4, nf=3)
+def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 212>; // RISCV vector tuple(min_num_elts=4, nf=4)
+def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 213>; // RISCV vector tuple(min_num_elts=4, nf=5)
+def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 214>; // RISCV vector tuple(min_num_elts=4, nf=6)
+def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 215>; // RISCV vector tuple(min_num_elts=4, nf=7)
+def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 216>; // RISCV vector tuple(min_num_elts=4, nf=8)
+def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 217>; // RISCV vector tuple(min_num_elts=8, nf=2)
+def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 218>; // RISCV vector tuple(min_num_elts=8, nf=3)
+def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 219>; // RISCV vector tuple(min_num_elts=8, nf=4)
+def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 220>; // RISCV vector tuple(min_num_elts=8, nf=5)
+def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 221>; // RISCV vector tuple(min_num_elts=8, nf=6)
+def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 222>; // RISCV vector tuple(min_num_elts=8, nf=7)
+def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 223>; // RISCV vector tuple(min_num_elts=8, nf=8)
+def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 224>; // RISCV vector tuple(min_num_elts=16, nf=2)
+def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 225>; // RISCV vector tuple(min_num_elts=16, nf=3)
+def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 226>; // RISCV vector tuple(min_num_elts=16, nf=4)
+def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 227>; // RISCV vector tuple(min_num_elts=32, nf=2)
+
+def x86mmx : ValueType<64, 228>; // X86 MMX value
+def Glue : ValueType<0, 229>; // Pre-RA sched glue
+def isVoid : ValueType<0, 230>; // Produces no value
+def untyped : ValueType<8, 231> { // Produces an untyped value
let LLVMName = "Untyped";
}
-def funcref : ValueType<0, 231>; // WebAssembly's funcref type
-def externref : ValueType<0, 232>; // WebAssembly's externref type
-def exnref : ValueType<0, 233>; // WebAssembly's exnref type
-def x86amx : ValueType<8192, 234>; // X86 AMX value
-def i64x8 : ValueType<512, 235>; // 8 Consecutive GPRs (AArch64)
+def funcref : ValueType<0, 232>; // WebAssembly's funcref type
+def externref : ValueType<0, 233>; // WebAssembly's externref type
+def exnref : ValueType<0, 234>; // WebAssembly's exnref type
+def x86amx : ValueType<8192, 235>; // X86 AMX value
+def i64x8 : ValueType<512, 236>; // 8 Consecutive GPRs (AArch64)
def aarch64svcount
- : ValueType<16, 236>; // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 237>; // SPIR-V's builtin type
+ : ValueType<16, 237>; // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 238>; // SPIR-V's builtin type
// AMDGPU buffer fat pointer, buffer rsrc + offset, rewritten before MIR translation.
// FIXME: Remove this and the getPointerType() override if MVT::i160 is added.
-def amdgpuBufferFatPointer : ValueType<160, 238>;
+def amdgpuBufferFatPointer : ValueType<160, 239>;
// AMDGPU buffer strided pointer, buffer rsrc + index + offset, doesn't reach MIR.
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
-def amdgpuBufferStridedPointer : ValueType<192, 239>;
+def amdgpuBufferStridedPointer : ValueType<192, 240>;
-def aarch64mfp8 : ValueType<8, 240>; // 8-bit value in FPR (AArch64)
+def aarch64mfp8 : ValueType<8, 241>; // 8-bit value in FPR (AArch64)
let isNormalValueType = false in {
def token : ValueType<0, 504>; // TokenTy
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
index cd5e271..44432d3 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFExpression.h
@@ -77,6 +77,9 @@ public:
private:
friend class DWARFExpression::iterator;
+ friend class DWARFExpressionPrinter;
+ friend class DWARFVerifier;
+
uint8_t Opcode; ///< The Op Opcode, DW_OP_<something>.
Description Desc;
bool Error = false;
@@ -99,11 +102,6 @@ public:
}
uint64_t getEndOffset() const { return EndOffset; }
bool isError() const { return Error; }
- LLVM_ABI bool print(raw_ostream &OS, DIDumpOptions DumpOpts,
- const DWARFExpression *Expr, DWARFUnit *U) const;
-
- /// Verify \p Op. Does not affect the return of \a isError().
- LLVM_ABI static bool verify(const Operation &Op, DWARFUnit *U);
private:
LLVM_ABI bool extract(DataExtractor Data, uint8_t AddressSize,
@@ -154,28 +152,12 @@ public:
iterator begin() const { return iterator(this, 0); }
iterator end() const { return iterator(this, Data.getData().size()); }
- LLVM_ABI void print(raw_ostream &OS, DIDumpOptions DumpOpts, DWARFUnit *U,
- bool IsEH = false) const;
-
- /// Print the expression in a format intended to be compact and useful to a
- /// user, but not perfectly unambiguous, or capable of representing every
- /// valid DWARF expression. Returns true if the expression was sucessfully
- /// printed.
- LLVM_ABI bool printCompact(
- raw_ostream &OS,
- std::function<StringRef(uint64_t RegNum, bool IsEH)> GetNameForDWARFReg =
- nullptr);
-
- LLVM_ABI bool verify(DWARFUnit *U);
-
LLVM_ABI bool operator==(const DWARFExpression &RHS) const;
StringRef getData() const { return Data.getData(); }
- LLVM_ABI static bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS,
- DIDumpOptions DumpOpts,
- uint8_t Opcode,
- const ArrayRef<uint64_t> Operands);
+ friend class DWARFExpressionPrinter;
+ friend class DWARFVerifier;
private:
DataExtractor Data;
@@ -187,5 +169,63 @@ inline bool operator==(const DWARFExpression::iterator &LHS,
const DWARFExpression::iterator &RHS) {
return LHS.Expr == RHS.Expr && LHS.Offset == RHS.Offset;
}
-}
-#endif
+
+// This functionality is separated from the main data structure so that nothing
+// in DWARFExpression.cpp needs build-time dependencies on DWARFUnit or other
+// higher-level Dwarf structures. This approach creates better layering and
+// allows DWARFExpression to be used from code which can't have dependencies on
+// those higher-level structures.
+
+class DWARFUnit;
+struct DIDumpOptions;
+class raw_ostream;
+
+class DWARFExpressionPrinter {
+public:
+  /// Print a DWARF expression.
+  /// \param E the expression to print
+  /// \param OS the stream to print to
+  /// \param U the containing DWARFUnit, used to resolve references, if any
+ static void print(const DWARFExpression *E, raw_ostream &OS,
+ DIDumpOptions DumpOpts, DWARFUnit *U, bool IsEH = false);
+
+ /// Print the expression in a format intended to be compact and useful to a
+ /// user, but not perfectly unambiguous, or capable of representing every
+  /// valid DWARF expression. Returns true if the expression was successfully
+ /// printed.
+ ///
+ /// \param E to be printed
+ /// \param OS to this stream
+ /// \param GetNameForDWARFReg callback to return dwarf register name
+ ///
+ /// \returns true if the expression was successfully printed
+ static bool printCompact(const DWARFExpression *E, raw_ostream &OS,
+ std::function<StringRef(uint64_t RegNum, bool IsEH)>
+ GetNameForDWARFReg = nullptr);
+
+  /// Pretty print a register opcode and its operands.
+  /// \param U the containing DWARF unit, if any
+  /// \param OS the stream to print to
+  /// \param DumpOpts the dump options to use
+  /// \param Opcode the opcode to print
+  /// \param Operands the operands of the opcode
+  ///
+  /// \returns true if the op was successfully printed
+ static bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS,
+ DIDumpOptions DumpOpts, uint8_t Opcode,
+ ArrayRef<uint64_t> Operands);
+
+private:
+ static bool printOp(const DWARFExpression::Operation *Op, raw_ostream &OS,
+ DIDumpOptions DumpOpts, const DWARFExpression *Expr,
+ DWARFUnit *U);
+
+ static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS,
+ DIDumpOptions DumpOpts,
+ ArrayRef<uint64_t> Operands,
+ unsigned Operand);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFEXPRESSION_H
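
The layering comment above is the crux of this header change: printing moves out of DWARFExpression itself so that DWARFExpression.cpp needs no build-time dependency on DWARFUnit. Call sites migrate from the old member functions to the static DWARFExpressionPrinter entry points, as the DWARFDebugLoc.cpp and DWARFDie.cpp hunks further down show. A minimal sketch of the new call shape, assuming a raw expression blob (the bytes, endianness, and null unit here are placeholders):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/DebugInfo/DIContext.h"
    #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
    #include "llvm/Support/DataExtractor.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>
    #include <optional>
    using namespace llvm;

    // Dump a raw expression blob with the new printer entry point.
    static void dumpExprBlob(StringRef Bytes, uint8_t AddrSize) {
      DataExtractor Data(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/0);
      DWARFExpression E(Data, AddrSize, /*Format=*/std::nullopt);
      // Previously: E.print(outs(), DIDumpOptions(), /*U=*/nullptr);
      DWARFExpressionPrinter::print(&E, outs(), DIDumpOptions(), /*U=*/nullptr);
    }
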
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 965f494..8e1b958 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -14,6 +14,7 @@
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
#include "llvm/DebugInfo/DWARF/DWARFAddressRange.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/Support/Compiler.h"
#include <cstdint>
@@ -323,6 +324,23 @@ private:
void verifyDebugNames(const DWARFSection &AccelSection,
const DataExtractor &StrData);
+  /// Verify that the expression is valid within the context of unit U.
+ ///
+ /// \param E expression to verify.
+ /// \param U containing DWARFUnit, if any.
+ ///
+  /// \returns true if E is a valid expression.
+ bool verifyExpression(const DWARFExpression &E, DWARFUnit *U);
+
+  /// Verify that the expression operation is valid within the context of
+  /// unit U.
+ ///
+ /// \param Op operation to verify
+ /// \param U containing DWARFUnit, if any
+ ///
+  /// \returns true if Op is a valid DWARF operation
+ bool verifyExpressionOp(const DWARFExpression::Operation &Op, DWARFUnit *U);
+
public:
LLVM_ABI
DWARFVerifier(raw_ostream &S, DWARFContext &D,
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index b5ad0de..07444cd 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -87,8 +87,7 @@ public:
/// Convert variable location debugging information stored in dbg.value
/// intrinsics into DbgMarkers / DbgRecords. Deletes all dbg.values in
- /// the process and sets IsNewDbgInfoFormat = true. Only takes effect if
- /// the UseNewDbgInfoFormat LLVM command line option is given.
+ /// the process and sets IsNewDbgInfoFormat = true.
LLVM_ABI void convertToNewDbgValues();
/// Convert variable location debugging information stored in DbgMarkers and
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 4d6bb1c..fa62bc0 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -845,6 +845,8 @@ public:
/// This type may be allocated on the stack, either as the allocated type
/// of an alloca instruction or as a byval function parameter.
CanBeLocal = 1U << 2,
+ // This type may be used as an element in a vector.
+ CanBeVectorElement = 1U << 3,
};
/// Returns true if the target extension type contains the given property.
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index b37c6c6..c2510ea 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -1038,6 +1038,21 @@ public:
/// Return value: true => null pointer dereference is not undefined.
bool nullPointerIsDefined() const;
+ /// Returns the alignment of the given function.
+ ///
+ /// Note that this is the alignment of the code, not the alignment of a
+ /// function pointer.
+ MaybeAlign getAlign() const { return GlobalObject::getAlign(); }
+
+ /// Sets the alignment attribute of the Function.
+ void setAlignment(Align Align) { GlobalObject::setAlignment(Align); }
+
+ /// Sets the alignment attribute of the Function.
+ ///
+ /// This method will be deprecated as the alignment property should always be
+ /// defined.
+ void setAlignment(MaybeAlign Align) { GlobalObject::setAlignment(Align); }
+
private:
void allocHungoffUselist();
template<int Idx> void setHungoffOperand(Constant *C);
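
These thin forwarders keep Function's alignment API public now that the GlobalObject accessors become protected in the GlobalObject.h hunk below. A minimal usage sketch (F is assumed to be a valid Function* obtained elsewhere):

    #include "llvm/IR/Function.h"
    #include "llvm/Support/Alignment.h"
    using namespace llvm;

    // Ensure a function's code is aligned to at least 32 bytes.
    void ensureCodeAlign(Function *F) {
      MaybeAlign Cur = F->getAlign();   // alignment of the code, not a pointer
      if (!Cur || *Cur < Align(32))
        F->setAlignment(Align(32));     // the new Align overload
    }
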
diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h
index 557add9..08a02b4 100644
--- a/llvm/include/llvm/IR/GlobalObject.h
+++ b/llvm/include/llvm/IR/GlobalObject.h
@@ -67,12 +67,7 @@ private:
public:
GlobalObject(const GlobalObject &) = delete;
- /// FIXME: Remove this function once transition to Align is over.
- uint64_t getAlignment() const {
- MaybeAlign Align = getAlign();
- return Align ? Align->value() : 0;
- }
-
+protected:
/// Returns the alignment of the given variable or function.
///
/// Note that for functions this is the alignment of the code, not the
@@ -103,6 +98,7 @@ public:
assert(getGlobalObjectSubClassData() == Val && "representation error");
}
+public:
/// Check if this global has a custom object file section.
///
/// This is more efficient than calling getSection() and checking for an empty
diff --git a/llvm/include/llvm/IR/GlobalVariable.h b/llvm/include/llvm/IR/GlobalVariable.h
index a411897f..388e1d7 100644
--- a/llvm/include/llvm/IR/GlobalVariable.h
+++ b/llvm/include/llvm/IR/GlobalVariable.h
@@ -298,6 +298,23 @@ public:
///
LLVM_ABI void clearCodeModel();
+ /// FIXME: Remove this function once transition to Align is over.
+ uint64_t getAlignment() const {
+ MaybeAlign Align = getAlign();
+ return Align ? Align->value() : 0;
+ }
+
+ /// Returns the alignment of the given variable.
+ MaybeAlign getAlign() const { return GlobalObject::getAlign(); }
+
+ /// Sets the alignment attribute of the GlobalVariable.
+ void setAlignment(Align Align) { GlobalObject::setAlignment(Align); }
+
+ /// Sets the alignment attribute of the GlobalVariable.
+ /// This method will be deprecated as the alignment property should always be
+ /// defined.
+ void setAlignment(MaybeAlign Align) { GlobalObject::setAlignment(Align); }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Value *V) {
return V->getValueID() == Value::GlobalVariableVal;
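
The same accessors land on GlobalVariable, and the deprecated getAlignment() shim moves here from GlobalObject. A short sketch of the difference between the two getters (GV assumed valid):

    #include "llvm/IR/GlobalVariable.h"
    #include <cstdint>
    using namespace llvm;

    // The legacy getter collapses "no alignment" to 0 ...
    uint64_t legacyAlign(const GlobalVariable *GV) { return GV->getAlignment(); }
    // ... while getAlign() preserves the unset state.
    bool hasExplicitAlign(const GlobalVariable *GV) {
      return GV->getAlign().has_value();
    }
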
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index d389905..e68243c2 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -335,8 +335,8 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
-def IIT_V2048: IIT_Vec<2048, 62>;
-def IIT_V4096: IIT_Vec<4096, 63>;
+def IIT_V2048 : IIT_Vec<2048, 62>;
+def IIT_V4096 : IIT_Vec<4096, 63>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -575,6 +575,7 @@ def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
def llvm_v128i32_ty : LLVMType<v128i32>; // 128 x i32
def llvm_v256i32_ty : LLVMType<v256i32>; // 256 x i32
def llvm_v2048i32_ty : LLVMType<v2048i32>; // 2048 x i32
+def llvm_v4096i32_ty : LLVMType<v4096i32>; // 4096 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
@@ -605,7 +606,7 @@ def llvm_v8f32_ty : LLVMType<v8f32>; // 8 x float
def llvm_v10f32_ty : LLVMType<v10f32>; // 10 x float
def llvm_v16f32_ty : LLVMType<v16f32>; // 16 x float
def llvm_v32f32_ty : LLVMType<v32f32>; // 32 x float
-def llvm_v2048f32_ty : LLVMType<v2048f32>; // 2048 x float
+def llvm_v2048f32_ty : LLVMType<v2048f32>; // 2048 x float
def llvm_v1f64_ty : LLVMType<v1f64>; // 1 x double
def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double
def llvm_v4f64_ty : LLVMType<v4f64>; // 4 x double
diff --git a/llvm/include/llvm/IR/PassManagerImpl.h b/llvm/include/llvm/IR/PassManagerImpl.h
index 04ccbbb..fe7b35f 100644
--- a/llvm/include/llvm/IR/PassManagerImpl.h
+++ b/llvm/include/llvm/IR/PassManagerImpl.h
@@ -22,8 +22,6 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/PrettyStackTrace.h"
-LLVM_ABI extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
namespace llvm {
template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
@@ -67,7 +65,7 @@ PreservedAnalyses PassManager<IRUnitT, AnalysisManagerT, ExtraArgTs...>::run(
// RemoveDIs: if requested, convert debug-info to DbgRecord representation
// for duration of these passes.
- ScopedDbgInfoFormatSetter FormatSetter(IR, UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(IR, true);
StackTraceEntry Entry(PI, IR);
for (auto &Pass : Passes) {
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 683193a..91c6872 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -835,8 +835,7 @@ struct LineLocation {
LineLocation(uint32_t L, uint32_t D) : LineOffset(L), Column(D) {}
bool operator<(const LineLocation &O) const {
- return LineOffset < O.LineOffset ||
- (LineOffset == O.LineOffset && Column < O.Column);
+ return std::tie(LineOffset, Column) < std::tie(O.LineOffset, O.Column);
}
bool operator==(const LineLocation &O) const {
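
std::tie is the standard idiom for lexicographic ordering and is harder to get subtly wrong than the hand-written `a < b || (a == b && c < d)` chain it replaces. A standalone sketch of the same comparison:

    #include <cstdint>
    #include <tuple>

    struct Loc {
      uint32_t LineOffset, Column;
      bool operator<(const Loc &O) const {
        // Orders by LineOffset first, breaking ties on Column.
        return std::tie(LineOffset, Column) < std::tie(O.LineOffset, O.Column);
      }
    };
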
diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h
index 2012cf8..e7b18a4 100644
--- a/llvm/include/llvm/SandboxIR/Constant.h
+++ b/llvm/include/llvm/SandboxIR/Constant.h
@@ -976,32 +976,6 @@ public:
}
}
- /// FIXME: Remove this function once transition to Align is over.
- uint64_t getAlignment() const {
- return cast<llvm::GlobalObject>(Val)->getAlignment();
- }
-
- /// Returns the alignment of the given variable or function.
- ///
- /// Note that for functions this is the alignment of the code, not the
- /// alignment of a function pointer.
- MaybeAlign getAlign() const {
- return cast<llvm::GlobalObject>(Val)->getAlign();
- }
-
- // TODO: Add missing: setAlignment(Align)
-
- /// Sets the alignment attribute of the GlobalObject.
- /// This method will be deprecated as the alignment property should always be
- /// defined.
- void setAlignment(MaybeAlign Align);
-
- unsigned getGlobalObjectSubClassData() const {
- return cast<llvm::GlobalObject>(Val)->getGlobalObjectSubClassData();
- }
-
- void setGlobalObjectSubClassData(unsigned V);
-
/// Check if this global has a custom object file section.
///
/// This is more efficient than calling getSection() and checking for an empty
@@ -1294,6 +1268,18 @@ public:
return cast<llvm::GlobalVariable>(Val)->getCodeModel();
}
+ /// Returns the alignment of the given variable.
+ MaybeAlign getAlign() const {
+ return cast<llvm::GlobalVariable>(Val)->getAlign();
+ }
+
+  // TODO: Add missing: setAlignment(Align)
+
+ /// Sets the alignment attribute of the GlobalVariable.
+ /// This method will be deprecated as the alignment property should always be
+ /// defined.
+ void setAlignment(MaybeAlign Align);
+
// TODO: Missing setCodeModel(). Requires custom tracker.
#ifndef NDEBUG
diff --git a/llvm/include/llvm/SandboxIR/Function.h b/llvm/include/llvm/SandboxIR/Function.h
index a810533..2c4b53e 100644
--- a/llvm/include/llvm/SandboxIR/Function.h
+++ b/llvm/include/llvm/SandboxIR/Function.h
@@ -58,6 +58,16 @@ public:
}
FunctionType *getFunctionType() const;
+ /// Returns the alignment of the given function.
+ MaybeAlign getAlign() const { return cast<llvm::Function>(Val)->getAlign(); }
+
+  // TODO: Add missing: setAlignment(Align)
+
+ /// Sets the alignment attribute of the Function.
+ /// This method will be deprecated as the alignment property should always be
+ /// defined.
+ void setAlignment(MaybeAlign Align);
+
#ifndef NDEBUG
void verify() const final {
assert(isa<llvm::Function>(Val) && "Expected Function!");
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index d56216e..5123bbe 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1061,7 +1061,9 @@ class OutPatFrag<dag ops, dag frag>
// PatLeaf's are pattern fragments that have no operands. This is just a helper
// to define immediates and other common things concisely.
class PatLeaf<dag frag, code pred = [{}], SDNodeXForm xform = NOOP_SDNodeXForm>
- : PatFrag<(ops), frag, pred, xform>;
+ : PatFrag<(ops), frag, pred, xform> {
+ code GISelLeafPredicateCode = ?;
+}
// ImmLeaf is a pattern fragment with a constraint on the immediate. The
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 7fd5278..351da0d6 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -917,6 +917,34 @@ public:
isOSBinFormatELF();
}
+ // ARM EABI is the bare-metal EABI described in ARM ABI documents and
+ // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
+ // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
+ // even for GNUEABI, so we can make a distinction here and still conform to
+ // the EABI on GNU (and Android) mode. This requires change in Clang, too.
+ // FIXME: The Darwin exception is temporary, while we move users to
+ // "*-*-*-macho" triples as quickly as possible.
+ bool isTargetAEABI() const {
+ return (getEnvironment() == Triple::EABI ||
+ getEnvironment() == Triple::EABIHF) &&
+ !isOSDarwin() && !isOSWindows();
+ }
+
+ bool isTargetGNUAEABI() const {
+ return (getEnvironment() == Triple::GNUEABI ||
+ getEnvironment() == Triple::GNUEABIT64 ||
+ getEnvironment() == Triple::GNUEABIHF ||
+ getEnvironment() == Triple::GNUEABIHFT64) &&
+ !isOSDarwin() && !isOSWindows();
+ }
+
+ bool isTargetMuslAEABI() const {
+ return (getEnvironment() == Triple::MuslEABI ||
+ getEnvironment() == Triple::MuslEABIHF ||
+ getEnvironment() == Triple::OpenHOS) &&
+ !isOSDarwin() && !isOSWindows();
+ }
+
/// Tests whether the target is T32.
bool isArmT32() const {
switch (getSubArch()) {
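
A quick sketch of how the new predicates classify triples; the triple strings below are illustrative, and each predicate is expected to return true for the matching environment while rejecting Darwin and Windows targets:

    #include "llvm/TargetParser/Triple.h"
    using namespace llvm;

    void classify() {
      Triple Bare("armv7em-none-eabi");            // EABI
      Triple Gnu("armv7-unknown-linux-gnueabihf"); // GNUEABIHF
      Triple Musl("armv7-unknown-linux-musleabi"); // MuslEABI
      bool A = Bare.isTargetAEABI();     // true: bare-metal EABI
      bool B = Gnu.isTargetGNUAEABI();   // true: GNU flavour
      bool C = Musl.isTargetMuslAEABI(); // true: musl flavour
      (void)A; (void)B; (void)C;
    }
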
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 7dd7f41..23ea696 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1672,6 +1672,7 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::sincos:
case Intrinsic::sinh:
case Intrinsic::cosh:
+ case Intrinsic::atan:
case Intrinsic::pow:
case Intrinsic::powi:
case Intrinsic::ldexp:
@@ -2538,6 +2539,8 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return ConstantFoldFP(sinh, APF, Ty);
case Intrinsic::cosh:
return ConstantFoldFP(cosh, APF, Ty);
+ case Intrinsic::atan:
+ return ConstantFoldFP(atan, APF, Ty);
case Intrinsic::sqrt:
return ConstantFoldFP(sqrt, APF, Ty);
case Intrinsic::amdgcn_cos:
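
Adding Intrinsic::atan to both switches lets a call such as `call double @llvm.atan.f64(double 1.0)` fold at compile time through ConstantFoldFP, which simply evaluates the host libm function. A sketch of the value the fold produces:

    #include <cmath>
    #include <cstdio>

    int main() {
      // ConstantFoldFP(atan, ...) evaluates host atan, so the folded
      // constant for @llvm.atan.f64(double 1.0) is pi/4:
      std::printf("%.17g\n", std::atan(1.0)); // 0.78539816339744828
    }
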
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index c7baf28..00cdb66 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1622,7 +1622,7 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
std::optional<int64_t> Diff =
getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
/*StrictCheck=*/true, CheckType);
- return Diff && *Diff == 1;
+ return Diff == 1;
}
void MemoryDepChecker::addAccess(StoreInst *SI) {
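
Dropping the explicit engaged-check is sound because std::optional's mixed comparison already returns false for a disengaged optional. Standalone sketch:

    #include <cassert>
    #include <cstdint>
    #include <optional>

    int main() {
      std::optional<int64_t> Empty, One = 1;
      assert(!(Empty == 1)); // disengaged optional never equals a value
      assert(One == 1);      // engaged optional compares its contents
    }
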
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d6bb852..d8c1096 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8660,6 +8660,7 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
if (LHSSafe && RHSSafe) {
// Both operands are known non-NaN.
NaNBehavior = SPNB_RETURNS_ANY;
+ Ordered = CmpInst::isOrdered(Pred);
} else if (CmpInst::isOrdered(Pred)) {
// An ordered comparison will return false when given a NaN, so it
// returns the RHS.
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 23db009..b933d24 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -64,8 +64,6 @@ static cl::opt<bool> AllowIncompleteIR(
"Allow incomplete IR on a best effort basis (references to unknown "
"metadata will be dropped)"));
-LLVM_ABI extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
static std::string getTypeString(Type *T) {
std::string Result;
raw_string_ostream Tmp(Result);
@@ -443,7 +441,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
UpgradeNVVMAnnotations(*M);
UpgradeSectionAttributes(*M);
- M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
+ M->setIsNewDbgInfoFormat(true);
if (!Slots)
return false;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 105edb9..31129b7 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -101,8 +101,6 @@ static cl::opt<bool> ExpandConstantExprs(
cl::desc(
"Expand constant expressions to instructions for testing purposes"));
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
namespace {
enum {
@@ -4481,9 +4479,9 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
Error BitcodeReader::parseModule(uint64_t ResumeBit,
bool ShouldLazyLoadMetadata,
ParserCallbacks Callbacks) {
- // In preparation for the deletion of debug-intrinsics, don't allow module
- // loading to escape intrinsics being autoupgraded to debug records.
- TheModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat;
+ // Don't allow modules to use debug-intrinsics: autoupgrading them is now
+ // mandatory.
+ TheModule->IsNewDbgInfoFormat = true;
this->ValueTypeCallback = std::move(Callbacks.ValueType);
if (ResumeBit) {
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index fad8ebf..628b939 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -122,8 +122,6 @@ namespace llvm {
extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold;
}
-LLVM_ABI extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
namespace {
/// These are manifest constants used by the bitcode writer. They do not need to
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index bcfc64c6..e13e923 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -369,7 +369,11 @@ Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL,
Alignment = InAlign;
// If the GV has a specified alignment, take it into account.
- const MaybeAlign GVAlign(GV->getAlign());
+ MaybeAlign GVAlign;
+ if (auto *GVar = dyn_cast<GlobalVariable>(GV))
+ GVAlign = GVar->getAlign();
+ else if (auto *F = dyn_cast<Function>(GV))
+ GVAlign = F->getAlign();
if (!GVAlign)
return Alignment;
diff --git a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
index 15f2420..84f3858 100644
--- a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
+++ b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
@@ -144,8 +144,8 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
if (BlockAddressUseIt == BB.use_end())
continue;
- assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(),
- IsBlockAddressUse) == BB.use_end() &&
+ assert(std::none_of(std::next(BlockAddressUseIt), BB.use_end(),
+ IsBlockAddressUse) &&
"There should only ever be a single blockaddress use because it is "
"a constant and should be uniqued.");
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index b7b8f66..34ac079 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -70,8 +70,6 @@ static cl::opt<bool> SimplifyMIR(
static cl::opt<bool> PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true),
cl::desc("Print MIR debug-locations"));
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
namespace {
/// This structure describes how to print out stack object references.
@@ -967,8 +965,7 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
}
void llvm::printMIR(raw_ostream &OS, const Module &M) {
- ScopedDbgInfoFormatSetter FormatSetter(const_cast<Module &>(M),
- UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(const_cast<Module &>(M), true);
yaml::Output Out(OS);
Out << const_cast<Module &>(M);
@@ -979,6 +976,6 @@ void llvm::printMIR(raw_ostream &OS, const MachineModuleInfo &MMI,
// RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
// in dbg.value format.
ScopedDbgInfoFormatSetter FormatSetter(
- const_cast<Function &>(MF.getFunction()), UseNewDbgInfoFormat);
+ const_cast<Function &>(MF.getFunction()), true);
printMF(OS, MMI, MF);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index eff1528..b0da536 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1992,6 +1992,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_UMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
return visitPARTIAL_REDUCE_MLA(N);
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
@@ -8127,6 +8128,59 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
+ SDValue AndR1, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
+ return SDValue();
+ SDValue NotOp = AndL0->getOperand(0);
+ if (NotOp == AndR1)
+ std::swap(AndR1, AndL1);
+ if (NotOp != AndL1)
+ return SDValue();
+
+ EVT VT = AndL1->getValueType(0);
+ SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+ SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
+ return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
+ const TargetLowering &TLI, const SDLoc &DL) {
+ // Note that masked-merge variants using XOR or ADD expressions are
+ // normalized to OR by InstCombine so we only check for OR.
+ assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+ SDValue N0 = Node->getOperand(0);
+ if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+ return SDValue();
+ SDValue N1 = Node->getOperand(1);
+ if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+ return SDValue();
+
+ // If the target supports and-not, don't fold this.
+ if (TLI.hasAndNot(SDValue(Node, 0)))
+ return SDValue();
+
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ SDValue N10 = N1->getOperand(0);
+ SDValue N11 = N1->getOperand(1);
+ if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+ return Result;
+ if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+ return Result;
+ if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+ return Result;
+ if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+ return Result;
+ return SDValue();
+}
+
SDValue DAGCombiner::visitOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -8305,6 +8359,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
return R;
+ if (VT.isScalarInteger() && VT != MVT::i1)
+ if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL))
+ return R;
+
return SDValue();
}
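
The fold above relies on the identity (m & x) | (~m & y) == ((x ^ y) & m) ^ y: where a mask bit is set the result is x, otherwise y. An exhaustive 8-bit check of the identity (standalone sketch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned m = 0; m < 256; ++m)
        for (unsigned x = 0; x < 256; ++x)
          for (unsigned y = 0; y < 256; ++y) {
            uint8_t Merged = (m & x) | (~m & y);
            uint8_t Folded = ((x ^ y) & m) ^ y;
            assert(Merged == Folded); // both pick x where m is set, else y
          }
      return 0;
    }
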
@@ -12680,26 +12738,27 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
SDValue LHSExtOp = LHS->getOperand(0);
EVT LHSExtOpVT = LHSExtOp.getValueType();
- bool ExtIsSigned = LHSOpcode == ISD::SIGN_EXTEND;
- unsigned NewOpcode =
- ExtIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
-
- // Only perform these combines if the target supports folding
- // the extends into the operation.
- if (!TLI.isPartialReduceMLALegalOrCustom(
- NewOpcode, TLI.getTypeToTransformTo(*Context, N->getValueType(0)),
- TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
- return SDValue();
-
// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1))
// -> partial_reduce_*mla(acc, x, C)
if (ISD::isConstantSplatVector(RHS.getNode(), C)) {
+ // TODO: Make use of partial_reduce_sumla here
APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits());
unsigned LHSBits = LHS.getValueType().getScalarSizeInBits();
if ((LHSOpcode != ISD::ZERO_EXTEND || CTrunc.zext(LHSBits) != C) &&
(LHSOpcode != ISD::SIGN_EXTEND || CTrunc.sext(LHSBits) != C))
return SDValue();
+ unsigned NewOpcode = LHSOpcode == ISD::SIGN_EXTEND
+ ? ISD::PARTIAL_REDUCE_SMLA
+ : ISD::PARTIAL_REDUCE_UMLA;
+
+ // Only perform these combines if the target supports folding
+ // the extends into the operation.
+ if (!TLI.isPartialReduceMLALegalOrCustom(
+ NewOpcode, TLI.getTypeToTransformTo(*Context, N->getValueType(0)),
+ TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
+ return SDValue();
+
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
DAG.getConstant(CTrunc, DL, LHSExtOpVT));
}
@@ -12709,26 +12768,46 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
return SDValue();
SDValue RHSExtOp = RHS->getOperand(0);
- if (LHSExtOpVT != RHSExtOp.getValueType() || LHSOpcode != RHSOpcode)
+ if (LHSExtOpVT != RHSExtOp.getValueType())
+ return SDValue();
+
+ unsigned NewOpc;
+ if (LHSOpcode == ISD::SIGN_EXTEND && RHSOpcode == ISD::SIGN_EXTEND)
+ NewOpc = ISD::PARTIAL_REDUCE_SMLA;
+ else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::ZERO_EXTEND)
+ NewOpc = ISD::PARTIAL_REDUCE_UMLA;
+ else if (LHSOpcode == ISD::SIGN_EXTEND && RHSOpcode == ISD::ZERO_EXTEND)
+ NewOpc = ISD::PARTIAL_REDUCE_SUMLA;
+ else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) {
+ NewOpc = ISD::PARTIAL_REDUCE_SUMLA;
+ std::swap(LHSExtOp, RHSExtOp);
+ } else
return SDValue();
-
- // For a 2-stage extend the signedness of both of the extends must be the
- // same. This is so the node can be folded into only a signed or unsigned
- // node.
- bool NodeIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA;
+  // For a 2-stage extend the signedness of both of the extends must match.
+ // If the mul has the same type, there is no outer extend, and thus we
+ // can simply use the inner extends to pick the result node.
+ // TODO: extend to handle nonneg zext as sext
EVT AccElemVT = Acc.getValueType().getVectorElementType();
- if (ExtIsSigned != NodeIsSigned &&
- Op1.getValueType().getVectorElementType() != AccElemVT)
+ if (Op1.getValueType().getVectorElementType() != AccElemVT &&
+ NewOpc != N->getOpcode())
return SDValue();
- return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
- RHSExtOp);
+ // Only perform these combines if the target supports folding
+ // the extends into the operation.
+ if (!TLI.isPartialReduceMLALegalOrCustom(
+ NewOpc, TLI.getTypeToTransformTo(*Context, N->getValueType(0)),
+ TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
+ return SDValue();
+
+ return DAG.getNode(NewOpc, DL, N->getValueType(0), Acc, LHSExtOp, RHSExtOp);
}
// partial.reduce.umla(acc, zext(op), splat(1))
// -> partial.reduce.umla(acc, op, splat(trunc(1)))
// partial.reduce.smla(acc, sext(op), splat(1))
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
+// partial.reduce.sumla(acc, sext(op), splat(1))
+// -> partial.reduce.smla(acc, op, splat(trunc(1)))
SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
SDLoc DL(N);
SDValue Acc = N->getOperand(0);
@@ -12745,7 +12824,7 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
return SDValue();
bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND;
- bool NodeIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA;
+ bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA;
EVT AccElemVT = Acc.getValueType().getVectorElementType();
if (Op1IsSigned != NodeIsSigned &&
Op1.getValueType().getVectorElementType() != AccElemVT)
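
The extend pairing in the hunk above reduces to a small table: sext/sext selects SMLA, zext/zext selects UMLA, and a mixed pair selects SUMLA (the opcode this patch introduces) with the operands swapped so the sign-extended input always comes first. A condensed sketch of that selection, assuming both inputs are already known to be ISD::SIGN_EXTEND or ISD::ZERO_EXTEND (anything else bails out in the real code):

    #include "llvm/CodeGen/ISDOpcodes.h"
    using namespace llvm;

    // Returns the partial-reduce opcode for an (LHSExt, RHSExt) pair and
    // reports via Swap whether the operands must be exchanged.
    unsigned pickPartialReduceOpc(unsigned LHSExt, unsigned RHSExt, bool &Swap) {
      Swap = false;
      if (LHSExt == RHSExt)
        return LHSExt == ISD::SIGN_EXTEND ? ISD::PARTIAL_REDUCE_SMLA
                                          : ISD::PARTIAL_REDUCE_UMLA;
      Swap = (LHSExt == ISD::ZERO_EXTEND); // sign-extended operand first
      return ISD::PARTIAL_REDUCE_SUMLA;
    }
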
@@ -14933,6 +15012,25 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
}
}
+ // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller
+ // than X, and the And doesn't change the lower iX bits, we can move the
+ // AssertZext in front of the And and drop the AssertSext.
+ if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND &&
+ N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ const APInt &Mask = N0.getConstantOperandAPInt(1);
+ if (AssertVT.bitsLT(BigA_AssertVT) &&
+ Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) {
+ SDLoc DL(N);
+ SDValue NewAssert =
+ DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1);
+ return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert,
+ N0.getOperand(1));
+ }
+ }
+
return SDValue();
}
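
A worked instance of the new combine: given (AssertZext (and (AssertSext X, i32), 0xFFFFFFFF), i8), the mask has at least 32 trailing ones, so the AND leaves the low 32 bits of X untouched; the combine rewrites it to (and (AssertZext X, i8), 0xFFFFFFFF), moving the narrower zext assertion onto X and dropping the wider sext assertion.
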
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5fe15e4..f5f4d71 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3562,9 +3562,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
unsigned Factor = Node->getNumOperands();
if (Factor <= 2 || !isPowerOf2_32(Factor))
break;
- SmallVector<SDValue, 8> Ops;
- for (SDValue Op : Node->ops())
- Ops.push_back(Op);
+ SmallVector<SDValue, 8> Ops(Node->ops());
EVT VecVT = Node->getValueType(0);
SmallVector<EVT> HalfVTs(Factor / 2, VecVT);
// Deinterleave at Factor/2 so each result contains two factors interleaved:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 2bcca91..dd64676 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -166,6 +166,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
Res = PromoteIntRes_PARTIAL_REDUCE_MLA(N);
break;
@@ -2093,6 +2094,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
break;
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
Res = PromoteIntOp_PARTIAL_REDUCE_MLA(N);
break;
}
@@ -2886,12 +2888,21 @@ SDValue DAGTypeLegalizer::PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N) {
SmallVector<SDValue, 1> NewOps(N->ops());
- if (N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA) {
+ switch (N->getOpcode()) {
+ case ISD::PARTIAL_REDUCE_SMLA:
NewOps[1] = SExtPromotedInteger(N->getOperand(1));
NewOps[2] = SExtPromotedInteger(N->getOperand(2));
- } else {
+ break;
+ case ISD::PARTIAL_REDUCE_UMLA:
NewOps[1] = ZExtPromotedInteger(N->getOperand(1));
NewOps[2] = ZExtPromotedInteger(N->getOperand(2));
+ break;
+ case ISD::PARTIAL_REDUCE_SUMLA:
+ NewOps[1] = SExtPromotedInteger(N->getOperand(1));
+ NewOps[2] = ZExtPromotedInteger(N->getOperand(2));
+ break;
+ default:
+ llvm_unreachable("unexpected opcode");
}
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 910a40e..4a1cd64 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -530,6 +530,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
}
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
Action =
TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0),
Node->getOperand(1).getValueType());
@@ -1211,6 +1212,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
Results.push_back(TLI.expandPartialReduceMLA(Node, DAG));
return;
case ISD::VECREDUCE_SEQ_FADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5582dc9..f63fe17 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1387,6 +1387,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
break;
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi);
break;
case ISD::GET_ACTIVE_LANE_MASK:
@@ -3473,6 +3474,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
break;
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
Res = SplitVecOp_PARTIAL_REDUCE_MLA(N);
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 279c7da..049c242 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7981,7 +7981,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
break;
}
case ISD::PARTIAL_REDUCE_UMLA:
- case ISD::PARTIAL_REDUCE_SMLA: {
+ case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA: {
[[maybe_unused]] EVT AccVT = N1.getValueType();
[[maybe_unused]] EVT Input1VT = N2.getValueType();
[[maybe_unused]] EVT Input2VT = N3.getValueType();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2460c86..e6a1dc9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7384,6 +7384,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, getValue(I.getOperand(0)));
return;
+ case Intrinsic::type_test:
+ case Intrinsic::public_type_test:
+ setValue(&I, getValue(ConstantInt::getTrue(I.getType())));
+ return;
+
case Intrinsic::assume:
case Intrinsic::experimental_noalias_scope_decl:
case Intrinsic::var_annotation:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 539f583..7fc1558 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -585,6 +585,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
return "partial_reduce_umla";
case ISD::PARTIAL_REDUCE_SMLA:
return "partial_reduce_smla";
+ case ISD::PARTIAL_REDUCE_SUMLA:
+ return "partial_reduce_sumla";
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e8e820a..a0ffb4b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -429,8 +429,16 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
// Update Chain.
Chain = Call.second;
} else {
+ assert(CCCode == (ShouldInvertCC ? ISD::SETEQ : ISD::SETNE) &&
+ "unordered call should be simple boolean");
+
EVT SetCCVT =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT);
+ if (getBooleanContents(RetVT) == ZeroOrOneBooleanContent) {
+ NewLHS = DAG.getNode(ISD::AssertZext, dl, RetVT, Call.first,
+ DAG.getValueType(MVT::i1));
+ }
+
SDValue Tmp = DAG.getSetCC(dl, SetCCVT, NewLHS, NewRHS, CCCode);
auto Call2 = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl, Chain);
CCCode = getCmpLibcallCC(LC2);
@@ -11891,13 +11899,17 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N,
EVT ExtMulOpVT =
EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(),
MulOpVT.getVectorElementCount());
- unsigned ExtOpc = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA
- ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
+
+ unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA
+ ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND;
+ unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA
+ ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
if (ExtMulOpVT != MulOpVT) {
- MulLHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulLHS);
- MulRHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulRHS);
+ MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS);
+ MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS);
}
SDValue Input = MulLHS;
APInt ConstantOne;
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
index 1a4fc49..8d25599 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
@@ -425,7 +425,8 @@ void CFIProgram::printOperand(raw_ostream &OS, DIDumpOptions DumpOpts,
case OT_Expression:
assert(Instr.Expression && "missing DWARFExpression object");
OS << " ";
- Instr.Expression->print(OS, DumpOpts, nullptr);
+ DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts,
+ nullptr);
break;
}
}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index aecfc45..c46b14b 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -14,6 +14,7 @@
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Errc.h"
@@ -110,7 +111,8 @@ void UnwindLocation::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
OS << " in addrspace" << *AddrSpace;
break;
case DWARFExpr: {
- Expr->print(OS, DumpOpts, nullptr);
+ if (Expr)
+ DWARFExpressionPrinter::print(&(*Expr), OS, DumpOpts, nullptr);
break;
}
case Constant:
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index ec7af79..fc71be3 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -116,7 +116,8 @@ static void dumpExpression(raw_ostream &OS, DIDumpOptions DumpOpts,
std::optional<dwarf::DwarfFormat> Format;
if (U)
Format = U->getFormat();
- DWARFExpression(Extractor, AddressSize, Format).print(OS, DumpOpts, U);
+ DWARFExpression E(Extractor, AddressSize, Format);
+ DWARFExpressionPrinter::print(&E, OS, DumpOpts, U);
}
bool DWARFLocationTable::dumpLocationList(
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index a0ce781..08dd9d3 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -98,8 +98,8 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue,
ArrayRef<uint8_t> Expr = *FormValue.getAsBlock();
DataExtractor Data(StringRef((const char *)Expr.data(), Expr.size()),
Ctx.isLittleEndian(), 0);
- DWARFExpression(Data, U->getAddressByteSize(), U->getFormParams().Format)
- .print(OS, DumpOpts, U);
+ DWARFExpression DE(Data, U->getAddressByteSize(), U->getFormParams().Format);
+ DWARFExpressionPrinter::print(&DE, OS, DumpOpts, U);
}
static DWARFDie resolveReferencedType(DWARFDie D, DWARFFormValue F) {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
index 3f6695e..8255e01 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -239,10 +239,23 @@ bool DWARFExpression::Operation::extract(DataExtractor Data,
return true;
}
-static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS,
- DIDumpOptions DumpOpts,
- ArrayRef<uint64_t> Operands,
- unsigned Operand) {
+std::optional<unsigned> DWARFExpression::Operation::getSubCode() const {
+ if (!Desc.Op.size() || Desc.Op[0] != Operation::SizeSubOpLEB)
+ return std::nullopt;
+ return Operands[0];
+}
+
+bool DWARFExpression::operator==(const DWARFExpression &RHS) const {
+ if (AddressSize != RHS.AddressSize || Format != RHS.Format)
+ return false;
+ return Data.getData() == RHS.Data.getData();
+}
+
+void DWARFExpressionPrinter::prettyPrintBaseTypeRef(DWARFUnit *U,
+ raw_ostream &OS,
+ DIDumpOptions DumpOpts,
+ ArrayRef<uint64_t> Operands,
+ unsigned Operand) {
assert(Operand < Operands.size() && "operand out of bounds");
if (!U) {
OS << format(" <base_type ref: 0x%" PRIx64 ">", Operands[Operand]);
@@ -261,10 +274,9 @@ static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS,
}
}
-bool DWARFExpression::prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS,
- DIDumpOptions DumpOpts,
- uint8_t Opcode,
- ArrayRef<uint64_t> Operands) {
+bool DWARFExpressionPrinter::prettyPrintRegisterOp(
+ DWARFUnit *U, raw_ostream &OS, DIDumpOptions DumpOpts, uint8_t Opcode,
+ ArrayRef<uint64_t> Operands) {
if (!DumpOpts.GetNameForDWARFReg)
return false;
@@ -295,87 +307,84 @@ bool DWARFExpression::prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS,
return false;
}
-std::optional<unsigned> DWARFExpression::Operation::getSubCode() const {
- if (!Desc.Op.size() || Desc.Op[0] != Operation::SizeSubOpLEB)
- return std::nullopt;
- return Operands[0];
-}
-
-bool DWARFExpression::Operation::print(raw_ostream &OS, DIDumpOptions DumpOpts,
- const DWARFExpression *Expr,
- DWARFUnit *U) const {
- if (Error) {
+bool DWARFExpressionPrinter::printOp(const DWARFExpression::Operation *Op,
+ raw_ostream &OS, DIDumpOptions DumpOpts,
+ const DWARFExpression *Expr,
+ DWARFUnit *U) {
+ if (Op->Error) {
OS << "<decoding error>";
return false;
}
- StringRef Name = OperationEncodingString(Opcode);
+ StringRef Name = OperationEncodingString(Op->Opcode);
assert(!Name.empty() && "DW_OP has no name!");
OS << Name;
- if ((Opcode >= DW_OP_breg0 && Opcode <= DW_OP_breg31) ||
- (Opcode >= DW_OP_reg0 && Opcode <= DW_OP_reg31) ||
- Opcode == DW_OP_bregx || Opcode == DW_OP_regx ||
- Opcode == DW_OP_regval_type)
- if (prettyPrintRegisterOp(U, OS, DumpOpts, Opcode, Operands))
+ if ((Op->Opcode >= DW_OP_breg0 && Op->Opcode <= DW_OP_breg31) ||
+ (Op->Opcode >= DW_OP_reg0 && Op->Opcode <= DW_OP_reg31) ||
+ Op->Opcode == DW_OP_bregx || Op->Opcode == DW_OP_regx ||
+ Op->Opcode == DW_OP_regval_type)
+ if (prettyPrintRegisterOp(U, OS, DumpOpts, Op->Opcode, Op->Operands))
return true;
- for (unsigned Operand = 0; Operand < Desc.Op.size(); ++Operand) {
- unsigned Size = Desc.Op[Operand];
- unsigned Signed = Size & Operation::SignBit;
+ for (unsigned Operand = 0; Operand < Op->Desc.Op.size(); ++Operand) {
+ unsigned Size = Op->Desc.Op[Operand];
+ unsigned Signed = Size & DWARFExpression::Operation::SignBit;
- if (Size == Operation::SizeSubOpLEB) {
- StringRef SubName = SubOperationEncodingString(Opcode, Operands[Operand]);
+ if (Size == DWARFExpression::Operation::SizeSubOpLEB) {
+ StringRef SubName =
+ SubOperationEncodingString(Op->Opcode, Op->Operands[Operand]);
assert(!SubName.empty() && "DW_OP SubOp has no name!");
OS << " " << SubName;
- } else if (Size == Operation::BaseTypeRef && U) {
+ } else if (Size == DWARFExpression::Operation::BaseTypeRef && U) {
// For DW_OP_convert the operand may be 0 to indicate that conversion to
// the generic type should be done. The same holds for DW_OP_reinterpret,
// which is currently not supported.
- if (Opcode == DW_OP_convert && Operands[Operand] == 0)
+ if (Op->Opcode == DW_OP_convert && Op->Operands[Operand] == 0)
OS << " 0x0";
else
- prettyPrintBaseTypeRef(U, OS, DumpOpts, Operands, Operand);
- } else if (Size == Operation::WasmLocationArg) {
+ prettyPrintBaseTypeRef(U, OS, DumpOpts, Op->Operands, Operand);
+ } else if (Size == DWARFExpression::Operation::WasmLocationArg) {
assert(Operand == 1);
- switch (Operands[0]) {
+ switch (Op->Operands[0]) {
case 0:
case 1:
case 2:
case 3: // global as uint32
case 4:
- OS << format(" 0x%" PRIx64, Operands[Operand]);
+ OS << format(" 0x%" PRIx64, Op->Operands[Operand]);
break;
default: assert(false);
}
- } else if (Size == Operation::SizeBlock) {
- uint64_t Offset = Operands[Operand];
- for (unsigned i = 0; i < Operands[Operand - 1]; ++i)
+ } else if (Size == DWARFExpression::Operation::SizeBlock) {
+ uint64_t Offset = Op->Operands[Operand];
+ for (unsigned i = 0; i < Op->Operands[Operand - 1]; ++i)
OS << format(" 0x%02x", Expr->Data.getU8(&Offset));
} else {
if (Signed)
- OS << format(" %+" PRId64, (int64_t)Operands[Operand]);
- else if (Opcode != DW_OP_entry_value &&
- Opcode != DW_OP_GNU_entry_value)
- OS << format(" 0x%" PRIx64, Operands[Operand]);
+ OS << format(" %+" PRId64, (int64_t)Op->Operands[Operand]);
+ else if (Op->Opcode != DW_OP_entry_value &&
+ Op->Opcode != DW_OP_GNU_entry_value)
+ OS << format(" 0x%" PRIx64, Op->Operands[Operand]);
}
}
return true;
}
-void DWARFExpression::print(raw_ostream &OS, DIDumpOptions DumpOpts,
- DWARFUnit *U, bool IsEH) const {
+void DWARFExpressionPrinter::print(const DWARFExpression *E, raw_ostream &OS,
+ DIDumpOptions DumpOpts, DWARFUnit *U,
+ bool IsEH) {
uint32_t EntryValExprSize = 0;
uint64_t EntryValStartOffset = 0;
- if (Data.getData().empty())
+ if (E->Data.getData().empty())
OS << "<empty>";
- for (auto &Op : *this) {
+ for (auto &Op : *E) {
DumpOpts.IsEH = IsEH;
- if (!Op.print(OS, DumpOpts, this, U)) {
+ if (!printOp(&Op, OS, DumpOpts, E, U)) {
uint64_t FailOffset = Op.getEndOffset();
- while (FailOffset < Data.getData().size())
- OS << format(" %02x", Data.getU8(&FailOffset));
+ while (FailOffset < E->Data.getData().size())
+ OS << format(" %02x", E->Data.getU8(&FailOffset));
return;
}
@@ -393,39 +402,11 @@ void DWARFExpression::print(raw_ostream &OS, DIDumpOptions DumpOpts,
OS << ")";
}
- if (Op.getEndOffset() < Data.getData().size())
+ if (Op.getEndOffset() < E->Data.getData().size())
OS << ", ";
}
}
-bool DWARFExpression::Operation::verify(const Operation &Op, DWARFUnit *U) {
- for (unsigned Operand = 0; Operand < Op.Desc.Op.size(); ++Operand) {
- unsigned Size = Op.Desc.Op[Operand];
-
- if (Size == Operation::BaseTypeRef) {
- // For DW_OP_convert the operand may be 0 to indicate that conversion to
- // the generic type should be done, so don't look up a base type in that
- // case. The same holds for DW_OP_reinterpret, which is currently not
- // supported.
- if (Op.Opcode == DW_OP_convert && Op.Operands[Operand] == 0)
- continue;
- auto Die = U->getDIEForOffset(U->getOffset() + Op.Operands[Operand]);
- if (!Die || Die.getTag() != dwarf::DW_TAG_base_type)
- return false;
- }
- }
-
- return true;
-}
-
-bool DWARFExpression::verify(DWARFUnit *U) {
- for (auto &Op : *this)
- if (!Operation::verify(Op, U))
- return false;
-
- return true;
-}
-
/// A user-facing string representation of a DWARF expression. This might be an
/// Address expression, in which case it will be implicitly dereferenced, or a
/// Value expression.
@@ -499,7 +480,7 @@ static bool printCompactDWARFExpr(
break;
}
case dwarf::DW_OP_LLVM_user: {
- assert(Op.getSubCode() && *Op.getSubCode() == dwarf::DW_OP_LLVM_nop);
+ assert(Op.getSubCode() == dwarf::DW_OP_LLVM_nop);
break;
}
default:
@@ -548,16 +529,10 @@ static bool printCompactDWARFExpr(
return true;
}
-bool DWARFExpression::printCompact(
- raw_ostream &OS,
+bool DWARFExpressionPrinter::printCompact(
+ const DWARFExpression *E, raw_ostream &OS,
std::function<StringRef(uint64_t RegNum, bool IsEH)> GetNameForDWARFReg) {
- return printCompactDWARFExpr(OS, begin(), end(), GetNameForDWARFReg);
-}
-
-bool DWARFExpression::operator==(const DWARFExpression &RHS) const {
- if (AddressSize != RHS.AddressSize || Format != RHS.Format)
- return false;
- return Data.getData() == RHS.Data.getData();
+ return printCompactDWARFExpr(OS, E->begin(), E->end(), GetNameForDWARFReg);
}
} // namespace llvm
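A note on the shape of this refactor: dumping moves off DWARFExpression itself onto static helpers in DWARFExpressionPrinter, so call sites now pass the expression explicitly. A minimal caller-side sketch of the new entry point (the wrapper function and its name are illustrative, not part of the patch):

    // Before: Expr.print(OS, DumpOpts, U, /*IsEH=*/false);
    // After: a stateless printer walks the expression handed to it.
    static void dumpExpr(const DWARFExpression &Expr, raw_ostream &OS,
                         DIDumpOptions DumpOpts, DWARFUnit *U) {
      DWARFExpressionPrinter::print(&Expr, OS, DumpOpts, U, /*IsEH=*/false);
    }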
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 43a62bd..c12786c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -659,6 +659,35 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
return NumErrors;
}
+bool DWARFVerifier::verifyExpressionOp(const DWARFExpression::Operation &Op,
+ DWARFUnit *U) {
+ for (unsigned Operand = 0; Operand < Op.Desc.Op.size(); ++Operand) {
+ unsigned Size = Op.Desc.Op[Operand];
+
+ if (Size == DWARFExpression::Operation::BaseTypeRef) {
+ // For DW_OP_convert the operand may be 0 to indicate that conversion to
+ // the generic type should be done, so don't look up a base type in that
+ // case. The same holds for DW_OP_reinterpret, which is currently not
+ // supported.
+ if (Op.Opcode == DW_OP_convert && Op.Operands[Operand] == 0)
+ continue;
+ auto Die = U->getDIEForOffset(U->getOffset() + Op.Operands[Operand]);
+ if (!Die || Die.getTag() != dwarf::DW_TAG_base_type)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool DWARFVerifier::verifyExpression(const DWARFExpression &E, DWARFUnit *U) {
+ for (auto &Op : E)
+ if (!verifyExpressionOp(Op, U))
+ return false;
+
+ return true;
+}
+
unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
unsigned NumErrors = 0;
@@ -727,7 +756,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
any_of(Expression, [](const DWARFExpression::Operation &Op) {
return Op.isError();
});
- if (Error || !Expression.verify(U))
+ if (Error || !verifyExpression(Expression, U))
ReportError("Invalid DWARF expressions",
"DIE contains invalid DWARF expression:");
}
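Condensed, the rule the new verifier helpers enforce is: a DW_OP_convert operand is either 0, meaning conversion to the generic type, or a unit-relative offset that must land on a DW_TAG_base_type DIE. Restated as a standalone predicate (checkConvertOperand is an illustrative name, not part of the patch):

    static bool checkConvertOperand(uint64_t Operand, DWARFUnit *U) {
      if (Operand == 0)
        return true; // conversion to the generic type
      DWARFDie Die = U->getDIEForOffset(U->getOffset() + Operand);
      return Die && Die.getTag() == dwarf::DW_TAG_base_type;
    }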
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
index d5e6513..7ff96ae 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
@@ -592,8 +592,8 @@ std::string LVDWARFReader::getRegisterName(LVSmall Opcode,
return {};
};
DumpOpts.GetNameForDWARFReg = GetRegName;
- DWARFExpression::prettyPrintRegisterOp(/*U=*/nullptr, Stream, DumpOpts,
- Opcode, Operands);
+ DWARFExpressionPrinter::prettyPrintRegisterOp(/*U=*/nullptr, Stream, DumpOpts,
+ Opcode, Operands);
return Stream.str();
}
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 628f0625..ed11ea0 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -31,30 +31,6 @@ using namespace llvm;
#define DEBUG_TYPE "ir"
STATISTIC(NumInstrRenumberings, "Number of renumberings across all blocks");
-// This cl-opt exists to control whether variable-location information is
-// produced using intrinsics, or whether DbgRecords are produced. However,
-// it's imminently being phased out, so give it a flag-name that is very
-// unlikely to be used anywhere.
-//
-// If you find yourself needing to use this flag for any period longer than
-// five minutes, please revert the patch making this change, and make contact
-// in this discourse post, where we can discuss any further transition work
-// that might be needed to remove debug intrinsics.
-//
-// https://discourse.llvm.org/t/psa-ir-output-changing-from-debug-intrinsics-to-debug-records/79578
-LLVM_ABI cl::opt<bool>
- UseNewDbgInfoFormat("dont-pass-this-flag-please-experimental-debuginfo",
- cl::Hidden, cl::init(true));
-
-// This cl-opt collects the --experimental-debuginfo-iterators flag and then
-// does nothing with it (because the it gets stored into an otherwise unused
-// cl-opt), so that we can disable debug-intrinsic production without
-// immediately modifying lots of tests. If your tests break because of this
-// change, please see the next comment up.
-static cl::opt<bool> DeliberatelyUnseenDbgInfoFlag(
- "experimental-debuginfo-iterators", cl::Hidden,
- cl::init(true));
-
DbgMarker *BasicBlock::createMarker(Instruction *I) {
assert(IsNewDbgInfoFormat &&
"Tried to create a marker in a non new debug-info block!");
@@ -187,7 +163,7 @@ template class llvm::SymbolTableListTraits<
BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
BasicBlock *InsertBefore)
: Value(Type::getLabelTy(C), Value::BasicBlockVal),
- IsNewDbgInfoFormat(UseNewDbgInfoFormat), Parent(nullptr) {
+ IsNewDbgInfoFormat(true), Parent(nullptr) {
if (NewParent)
insertInto(NewParent, InsertBefore);
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 1954b44..a7c3a56 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2108,8 +2108,10 @@ LLVMTypeRef LLVMGlobalGetValueType(LLVMValueRef Global) {
unsigned LLVMGetAlignment(LLVMValueRef V) {
Value *P = unwrap(V);
- if (GlobalObject *GV = dyn_cast<GlobalObject>(P))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P))
return GV->getAlign() ? GV->getAlign()->value() : 0;
+ if (Function *F = dyn_cast<Function>(P))
+ return F->getAlign() ? F->getAlign()->value() : 0;
if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
return AI->getAlign().value();
if (LoadInst *LI = dyn_cast<LoadInst>(P))
@@ -2128,8 +2130,10 @@ unsigned LLVMGetAlignment(LLVMValueRef V) {
void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
Value *P = unwrap(V);
- if (GlobalObject *GV = dyn_cast<GlobalObject>(P))
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P))
GV->setAlignment(MaybeAlign(Bytes));
+ else if (Function *F = dyn_cast<Function>(P))
+ F->setAlignment(MaybeAlign(Bytes));
else if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
AI->setAlignment(Align(Bytes));
else if (LoadInst *LI = dyn_cast<LoadInst>(P))
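Behaviorally, these C API hunks split the old GlobalObject case so that globals and functions each use their own accessor; note a GlobalIFunc no longer matches either branch. A usage sketch against the public C API, assuming a module handle M that defines @g and @f:

    LLVMValueRef G = LLVMGetNamedGlobal(M, "g");
    LLVMSetAlignment(G, 16);
    unsigned GlobalAlign = LLVMGetAlignment(G); // 16

    LLVMValueRef F = LLVMGetNamedFunction(M, "f");
    LLVMSetAlignment(F, 32);
    unsigned FnAlign = LLVMGetAlignment(F);     // 32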
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index dfffbbf..63665d8 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -65,8 +65,6 @@ static cl::opt<int> NonGlobalValueMaxNameSize(
"non-global-value-max-name-size", cl::Hidden, cl::init(1024),
cl::desc("Maximum size for the name of non-global values."));
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
void Function::renumberBlocks() {
validateBlockNumbers();
@@ -492,7 +490,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
const Twine &name, Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name,
computeAddrSpace(AddrSpace, ParentModule)),
- NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
+ NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(true) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 580b0af..90ec662 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -457,10 +457,10 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) {
}
static MaybeAlign getAlign(Value *Ptr) {
- if (auto *O = dyn_cast<GlobalObject>(Ptr))
- return O->getAlign();
+ if (auto *V = dyn_cast<GlobalVariable>(Ptr))
+ return V->getAlign();
if (auto *A = dyn_cast<GlobalAlias>(Ptr))
- return A->getAliaseeObject()->getAlign();
+ return getAlign(A->getAliaseeObject());
return {};
}
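The getAlign change is subtle: recursing on the aliasee, rather than calling getAlign on whatever GlobalObject it happens to be, means a chain of aliases still bottoms out at the underlying variable's alignment, while a function aliasee now yields no alignment instead of the function's. Restated with an illustrative name:

    static MaybeAlign getAlignThroughAliases(Value *Ptr) {
      if (auto *V = dyn_cast<GlobalVariable>(Ptr))
        return V->getAlign();
      if (auto *A = dyn_cast<GlobalAlias>(Ptr))
        return getAlignThroughAliases(A->getAliaseeObject());
      return {}; // functions, instructions, arguments: unknown here
    }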
diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp
index a26bb0f..eb35377 100644
--- a/llvm/lib/IR/IRPrintingPasses.cpp
+++ b/llvm/lib/IR/IRPrintingPasses.cpp
@@ -24,8 +24,6 @@
using namespace llvm;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
namespace {
class PrintModulePassWrapper : public ModulePass {
@@ -42,12 +40,10 @@ public:
ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {}
bool runOnModule(Module &M) override {
- ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(M, true);
// Remove intrinsic declarations when printing in the new format.
- // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to
- // update test output.
- if (UseNewDbgInfoFormat)
- M.removeDebugIntrinsicDeclarations();
+  // TODO: consider removing this now that debug intrinsics are gone.
+ M.removeDebugIntrinsicDeclarations();
if (llvm::isFunctionInPrintList("*")) {
if (!Banner.empty())
@@ -88,7 +84,7 @@ public:
// This pass just prints a banner followed by the function as it's processed.
bool runOnFunction(Function &F) override {
- ScopedDbgInfoFormatSetter FormatSetter(F, UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(F, true);
if (isFunctionInPrintList(F.getName())) {
if (forcePrintModuleIR())
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 18ce165..109d516 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -130,8 +130,6 @@ BasicBlock::iterator Instruction::insertInto(BasicBlock *ParentBB,
return getIterator();
}
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
void Instruction::insertBefore(BasicBlock &BB,
InstListType::iterator InsertPos) {
assert(!DebugMarker);
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 21f5c06..7b6083a 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -355,13 +355,19 @@ template <> struct MDNodeKeyImpl<DILocation> {
}
unsigned getHashValue() const {
- return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode
#ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
- ,
- AtomGroup, (uint8_t)AtomRank);
-#else
- );
+ // Hashing AtomGroup and AtomRank substantially impacts performance whether
+ // Key Instructions is enabled or not. We can't detect whether it's enabled
+ // here cheaply; avoiding hashing zero values is a good approximation. This
+ // affects Key Instruction builds too, but any potential costs incurred by
+ // messing with the hash distribution* appear to still be massively
+  // outweighed by the overall compile-time savings from performing this check.
+ // * (hash_combine(x) != hash_combine(x, 0))
+ if (AtomGroup || AtomRank)
+ return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode,
+ AtomGroup, (uint8_t)AtomRank);
#endif
+ return hash_combine(Line, Column, Scope, InlinedAt, ImplicitCode);
}
};
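The starred footnote is the crux: llvm::hash_combine folds arity into the result, so hashing explicit zero AtomGroup/AtomRank values does not collapse onto the shorter combine. A self-contained check of that property (inequality is expected in practice, not formally guaranteed):

    #include "llvm/ADT/Hashing.h"

    bool arityAffectsHash() {
      llvm::hash_code A = llvm::hash_combine(1u, 2u, 3u);
      llvm::hash_code B = llvm::hash_combine(1u, 2u, 3u, 0u, (uint8_t)0);
      return A != B; // expected true: hash_combine(x) != hash_combine(x, 0)
    }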
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 110636b..fd69e30 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -32,7 +32,6 @@
using namespace llvm;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
// See PassManagers.h for Pass Manager infrastructure overview.
//===----------------------------------------------------------------------===//
@@ -530,7 +529,7 @@ bool PassManagerImpl::run(Module &M) {
// RemoveDIs: if a command line flag is given, convert to the
// DbgVariableRecord representation of debug-info for the duration of these
// passes.
- ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(M, true);
for (ImmutablePass *ImPass : getImmutablePasses())
Changed |= ImPass->doInitialization(M);
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 132a8c8..0a47f98 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -54,8 +54,6 @@
using namespace llvm;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
//===----------------------------------------------------------------------===//
// Methods to implement the globals and functions lists.
//
@@ -74,7 +72,7 @@ template class LLVM_EXPORT_TEMPLATE llvm::SymbolTableListTraits<GlobalIFunc>;
Module::Module(StringRef MID, LLVMContext &C)
: Context(C), ValSymTab(std::make_unique<ValueSymbolTable>(-1)),
ModuleID(std::string(MID)), SourceFileName(std::string(MID)),
- IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
+ IsNewDbgInfoFormat(true) {
Context.addModule(this);
}
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 54227de..541379e 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -31,6 +31,56 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
}
}
+static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
+ // Register based DivRem for AEABI (RTABI 4.2)
+ if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
+ TT.isTargetMuslAEABI() || TT.isOSWindows()) {
+ if (TT.isOSWindows()) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char *const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ {RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS},
+ {RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS},
+ {RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS},
+ {RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS},
+
+ {RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS},
+ {RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS},
+ {RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS},
+ {RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS},
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ Info.setLibcallName(LC.Op, LC.Name);
+ Info.setLibcallCallingConv(LC.Op, LC.CC);
+ }
+ } else {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char *const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ {RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
+ {RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
+ {RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
+ {RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS},
+
+ {RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
+ {RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
+ {RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
+ {RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS},
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ Info.setLibcallName(LC.Op, LC.Name);
+ Info.setLibcallCallingConv(LC.Op, LC.CC);
+ }
+ }
+ }
+}
+
/// Set default libcall names. If a target wants to opt-out of a libcall it
/// should be placed here.
void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
@@ -129,6 +179,40 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
setLibcallName(RTLIB::OLE_F128, "__lekf2");
setLibcallName(RTLIB::OGT_F128, "__gtkf2");
setLibcallName(RTLIB::UO_F128, "__unordkf2");
+
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
+ setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
+ setLibcallName(RTLIB::CEIL_F128, "ceilf128");
+ setLibcallName(RTLIB::FLOOR_F128, "floorf128");
+ setLibcallName(RTLIB::TRUNC_F128, "truncf128");
+ setLibcallName(RTLIB::ROUND_F128, "roundf128");
+ setLibcallName(RTLIB::LROUND_F128, "lroundf128");
+ setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
+ setLibcallName(RTLIB::RINT_F128, "rintf128");
+ setLibcallName(RTLIB::LRINT_F128, "lrintf128");
+ setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
+ setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
+ setLibcallName(RTLIB::FMA_F128, "fmaf128");
+ setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+
+ if (TT.isOSAIX()) {
+ bool isPPC64 = TT.isPPC64();
+ setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
+ setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");
+ setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");
+ setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");
+ }
}
// A few names are different on particular architectures or environments.
@@ -264,8 +348,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
if (TT.getArch() == Triple::ArchType::aarch64)
setAArch64LibcallNames(*this, TT);
-
- if (TT.getArch() == Triple::ArchType::avr) {
+ else if (TT.isARM() || TT.isThumb())
+ setARMLibcallNames(*this, TT);
+ else if (TT.getArch() == Triple::ArchType::avr) {
// Division rtlib functions (not supported), use divmod functions instead
setLibcallName(RTLIB::SDIV_I8, nullptr);
setLibcallName(RTLIB::SDIV_I16, nullptr);
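After initLibcalls runs, this table is what downstream code consults; the ARM rows set both a symbol and a calling convention. A query sketch (getLibcallName and getLibcallCallingConv are assumed to be the existing RuntimeLibcallsInfo accessors):

    // On an AEABI ARM triple this is expected to yield
    // "__aeabi_idivmod" with CallingConv::ARM_AAPCS.
    const char *Name = Info.getLibcallName(RTLIB::SDIVREM_I32);
    CallingConv::ID CC = Info.getLibcallCallingConv(RTLIB::SDIVREM_I32);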
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 4b43b52..7e64992 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -790,8 +790,12 @@ VectorType *VectorType::get(Type *ElementType, ElementCount EC) {
}
bool VectorType::isValidElementType(Type *ElemTy) {
- return ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
- ElemTy->isPointerTy() || ElemTy->getTypeID() == TypedPointerTyID;
+ if (ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
+ ElemTy->isPointerTy() || ElemTy->getTypeID() == TypedPointerTyID)
+ return true;
+ if (auto *TTy = dyn_cast<TargetExtType>(ElemTy))
+ return TTy->hasProperty(TargetExtType::CanBeVectorElement);
+ return false;
}
//===----------------------------------------------------------------------===//
@@ -801,8 +805,9 @@ bool VectorType::isValidElementType(Type *ElemTy) {
FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) {
assert(NumElts > 0 && "#Elements of a VectorType must be greater than 0");
assert(isValidElementType(ElementType) && "Element type of a VectorType must "
- "be an integer, floating point, or "
- "pointer type.");
+ "be an integer, floating point, "
+ "pointer type, or a valid target "
+ "extension type.");
auto EC = ElementCount::getFixed(NumElts);
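Put together, a target extension type becomes a legal vector element exactly when its TargetTypeInfo carries CanBeVectorElement (and, per the new assert below, a sized layout type). A sketch using the test-only type this patch introduces (API details hedged to the current tree):

    // (uses llvm/IR/DerivedTypes.h and llvm/IR/LLVMContext.h)
    LLVMContext Ctx;
    Type *EltTy = TargetExtType::get(Ctx, "llvm.test.vectorelement");
    assert(VectorType::isValidElementType(EltTy));
    auto *VTy = FixedVectorType::get(EltTy, 4);
    // VTy prints as <4 x target("llvm.test.vectorelement")>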
@@ -968,7 +973,11 @@ struct TargetTypeInfo {
template <typename... ArgTys>
TargetTypeInfo(Type *LayoutType, ArgTys... Properties)
- : LayoutType(LayoutType), Properties((0 | ... | Properties)) {}
+ : LayoutType(LayoutType), Properties((0 | ... | Properties)) {
+ assert((!(this->Properties & TargetExtType::CanBeVectorElement) ||
+ LayoutType->isSized()) &&
+ "Vector element type must be sized");
+ }
};
} // anonymous namespace
@@ -1037,6 +1046,13 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
TargetExtType::CanBeGlobal);
}
+  // Type used to test the vector-element target extension property.
+ // Can be removed once a public target extension type uses CanBeVectorElement.
+ if (Name == "llvm.test.vectorelement") {
+ return TargetTypeInfo(Type::getInt32Ty(C), TargetExtType::CanBeLocal,
+ TargetExtType::CanBeVectorElement);
+ }
+
return TargetTypeInfo(Type::getVoidTy(C));
}
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index d6cb65d..02c16e2 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -957,30 +957,27 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
Align Value::getPointerAlignment(const DataLayout &DL) const {
assert(getType()->isPointerTy() && "must be pointer");
- if (auto *GO = dyn_cast<GlobalObject>(this)) {
- if (isa<Function>(GO)) {
- Align FunctionPtrAlign = DL.getFunctionPtrAlign().valueOrOne();
- switch (DL.getFunctionPtrAlignType()) {
- case DataLayout::FunctionPtrAlignType::Independent:
- return FunctionPtrAlign;
- case DataLayout::FunctionPtrAlignType::MultipleOfFunctionAlign:
- return std::max(FunctionPtrAlign, GO->getAlign().valueOrOne());
- }
- llvm_unreachable("Unhandled FunctionPtrAlignType");
+ if (const Function *F = dyn_cast<Function>(this)) {
+ Align FunctionPtrAlign = DL.getFunctionPtrAlign().valueOrOne();
+ switch (DL.getFunctionPtrAlignType()) {
+ case DataLayout::FunctionPtrAlignType::Independent:
+ return FunctionPtrAlign;
+ case DataLayout::FunctionPtrAlignType::MultipleOfFunctionAlign:
+ return std::max(FunctionPtrAlign, F->getAlign().valueOrOne());
}
- const MaybeAlign Alignment(GO->getAlign());
+ llvm_unreachable("Unhandled FunctionPtrAlignType");
+ } else if (auto *GVar = dyn_cast<GlobalVariable>(this)) {
+ const MaybeAlign Alignment(GVar->getAlign());
if (!Alignment) {
- if (auto *GVar = dyn_cast<GlobalVariable>(GO)) {
- Type *ObjectType = GVar->getValueType();
- if (ObjectType->isSized()) {
- // If the object is defined in the current Module, we'll be giving
- // it the preferred alignment. Otherwise, we have to assume that it
- // may only have the minimum ABI alignment.
- if (GVar->isStrongDefinitionForLinker())
- return DL.getPreferredAlign(GVar);
- else
- return DL.getABITypeAlign(ObjectType);
- }
+ Type *ObjectType = GVar->getValueType();
+ if (ObjectType->isSized()) {
+ // If the object is defined in the current Module, we'll be giving
+ // it the preferred alignment. Otherwise, we have to assume that it
+ // may only have the minimum ABI alignment.
+ if (GVar->isStrongDefinitionForLinker())
+ return DL.getPreferredAlign(GVar);
+ else
+ return DL.getABITypeAlign(ObjectType);
}
}
return Alignment.valueOrOne();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 2d03a7a..592bb6a 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -735,12 +735,6 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
"Global is external, but doesn't have external or weak linkage!", &GV);
if (const GlobalObject *GO = dyn_cast<GlobalObject>(&GV)) {
-
- if (MaybeAlign A = GO->getAlign()) {
- Check(A->value() <= Value::MaximumAlignment,
- "huge alignment values are unsupported", GO);
- }
-
if (const MDNode *Associated =
GO->getMetadata(LLVMContext::MD_associated)) {
Check(Associated->getNumOperands() == 1,
@@ -830,6 +824,11 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
Type *GVType = GV.getValueType();
+ if (MaybeAlign A = GV.getAlign()) {
+ Check(A->value() <= Value::MaximumAlignment,
+ "huge alignment values are unsupported", &GV);
+ }
+
if (GV.hasInitializer()) {
Check(GV.getInitializer()->getType() == GVType,
"Global variable initializer type does not match global "
@@ -2869,6 +2868,11 @@ void Verifier::visitFunction(const Function &F) {
Check(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
"Invalid struct return type!", &F);
+ if (MaybeAlign A = F.getAlign()) {
+ Check(A->value() <= Value::MaximumAlignment,
+ "huge alignment values are unsupported", &F);
+ }
+
AttributeList Attrs = F.getAttributes();
Check(verifyAttributeCount(Attrs, FT->getNumParams()),
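Since the alignment bound is now checked per kind, both an over-aligned global variable and an over-aligned function are rejected in their respective visitors. An illustrative way to observe this through the verifier API (the helper name and message probing are assumptions, not part of the patch):

    // (uses llvm/IR/Verifier.h)
    static bool hasHugeAlignError(Module &M) {
      std::string Msg;
      raw_string_ostream OS(Msg);
      bool Broken = verifyModule(M, &OS);
      // e.g. a global or function with align 2^33 should trip
      // "huge alignment values are unsupported".
      return Broken && StringRef(Msg).contains("huge alignment");
    }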
diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp
index d6bd6c3..5fd6a09 100644
--- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp
+++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp
@@ -23,8 +23,6 @@
using namespace llvm;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
PrintModulePass::PrintModulePass() : OS(dbgs()) {}
PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
bool ShouldPreserveUseListOrder,
@@ -34,12 +32,10 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
EmitSummaryIndex(EmitSummaryIndex) {}
PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) {
- ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(M, true);
// Remove intrinsic declarations when printing in the new format.
- // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to
- // update test output.
- if (UseNewDbgInfoFormat)
- M.removeDebugIntrinsicDeclarations();
+ // TODO: consider removing this now that debug intrinsics are gone.
+ M.removeDebugIntrinsicDeclarations();
if (llvm::isFunctionInPrintList("*")) {
if (!Banner.empty())
@@ -76,7 +72,7 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner)
PreservedAnalyses PrintFunctionPass::run(Function &F,
FunctionAnalysisManager &) {
- ScopedDbgInfoFormatSetter FormatSetter(F, UseNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(F, true);
if (isFunctionInPrintList(F.getName())) {
if (forcePrintModuleIR())
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index ba120a0..df39507 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -70,8 +70,6 @@ using namespace object;
#define DEBUG_TYPE "lto"
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
static cl::opt<bool>
DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
@@ -602,7 +600,7 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
: ParallelCodeGenParallelismLevel(ParallelCodeGenParallelismLevel),
Ctx(Conf), CombinedModule(std::make_unique<Module>("ld-temp.o", Ctx)),
Mover(std::make_unique<IRMover>(*CombinedModule)) {
- CombinedModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat;
+ CombinedModule->IsNewDbgInfoFormat = true;
}
LTO::ThinLTOState::ThinLTOState(ThinBackend BackendParam)
diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp
index 749ae63..e0a9758 100644
--- a/llvm/lib/LTO/LTOModule.cpp
+++ b/llvm/lib/LTO/LTOModule.cpp
@@ -414,8 +414,11 @@ void LTOModule::addDefinedFunctionSymbol(StringRef Name, const GlobalValue *F) {
void LTOModule::addDefinedSymbol(StringRef Name, const GlobalValue *def,
bool isFunction) {
- const GlobalObject *go = dyn_cast<GlobalObject>(def);
- uint32_t attr = go ? Log2(go->getAlign().valueOrOne()) : 0;
+ uint32_t attr = 0;
+ if (auto *gv = dyn_cast<GlobalVariable>(def))
+ attr = Log2(gv->getAlign().valueOrOne());
+ else if (auto *f = dyn_cast<Function>(def))
+ attr = Log2(f->getAlign().valueOrOne());
// set permissions part
if (isFunction) {
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index 79eb0133..13ac87e 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -71,7 +71,7 @@ struct Section {
}
bool hasValidOffset() const {
- return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0));
+ return !(isVirtualSection() || OriginalOffset == 0);
}
};
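The hasValidOffset simplification leans on std::optional's mixed comparison: optional == 0 is true only when the optional is engaged and holds zero, which is exactly the old OriginalOffset && *OriginalOffset == 0 test. The ArchProfileAttr and getSubCode hunks elsewhere in this patch rely on the same semantics. In isolation:

    #include <cassert>
    #include <cstdint>
    #include <optional>

    int main() {
      std::optional<uint64_t> None, Zero(0), NonZero(42);
      assert(!(None == 0u));    // disengaged: never equal to a value
      assert(Zero == 0u);       // engaged zero: equal
      assert(!(NonZero == 0u)); // engaged non-zero: not equal
    }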
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index bf6d519..dd24055 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -736,8 +736,7 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const {
case ARMBuildAttrs::v7: {
std::optional<unsigned> ArchProfileAttr =
Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile);
- if (ArchProfileAttr &&
- *ArchProfileAttr == ARMBuildAttrs::MicroControllerProfile)
+ if (ArchProfileAttr == ARMBuildAttrs::MicroControllerProfile)
Triple += "v7m";
else
Triple += "v7";
diff --git a/llvm/lib/SandboxIR/Constant.cpp b/llvm/lib/SandboxIR/Constant.cpp
index fa79a01..82cf087 100644
--- a/llvm/lib/SandboxIR/Constant.cpp
+++ b/llvm/lib/SandboxIR/Constant.cpp
@@ -282,20 +282,11 @@ PoisonValue *PoisonValue::getElementValue(unsigned Idx) const {
cast<llvm::PoisonValue>(Val)->getElementValue(Idx)));
}
-void GlobalObject::setAlignment(MaybeAlign Align) {
+void GlobalVariable::setAlignment(MaybeAlign Align) {
Ctx.getTracker()
- .emplaceIfTracking<
- GenericSetter<&GlobalObject::getAlign, &GlobalObject::setAlignment>>(
- this);
- cast<llvm::GlobalObject>(Val)->setAlignment(Align);
-}
-
-void GlobalObject::setGlobalObjectSubClassData(unsigned V) {
- Ctx.getTracker()
- .emplaceIfTracking<
- GenericSetter<&GlobalObject::getGlobalObjectSubClassData,
- &GlobalObject::setGlobalObjectSubClassData>>(this);
- cast<llvm::GlobalObject>(Val)->setGlobalObjectSubClassData(V);
+ .emplaceIfTracking<GenericSetter<&GlobalVariable::getAlign,
+ &GlobalVariable::setAlignment>>(this);
+ cast<llvm::GlobalVariable>(Val)->setAlignment(Align);
}
void GlobalObject::setSection(StringRef S) {
diff --git a/llvm/lib/SandboxIR/Function.cpp b/llvm/lib/SandboxIR/Function.cpp
index f7a1d35..f400389 100644
--- a/llvm/lib/SandboxIR/Function.cpp
+++ b/llvm/lib/SandboxIR/Function.cpp
@@ -17,6 +17,13 @@ FunctionType *Function::getFunctionType() const {
Ctx.getType(cast<llvm::Function>(Val)->getFunctionType()));
}
+void Function::setAlignment(MaybeAlign Align) {
+ Ctx.getTracker()
+ .emplaceIfTracking<
+ GenericSetter<&Function::getAlign, &Function::setAlignment>>(this);
+ cast<llvm::Function>(Val)->setAlignment(Align);
+}
+
#ifndef NDEBUG
void Function::dumpNameAndArgs(raw_ostream &OS) const {
auto *F = cast<llvm::Function>(Val);
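SandboxIR setters route through the change tracker so IR mutations can be rolled back; the new Function::setAlignment follows the same GenericSetter pattern as the GlobalVariable setter above. A usage sketch, assuming the sandboxir tracker's save/revert API and an existing sandboxir::Function *F:

    Ctx.getTracker().save();     // begin tracking changes
    F->setAlignment(Align(32));  // recorded via GenericSetter
    Ctx.getTracker().revert();   // alignment rolls back to its prior value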
diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp
index 531c035..f1c15c0 100644
--- a/llvm/lib/Support/DynamicLibrary.cpp
+++ b/llvm/lib/Support/DynamicLibrary.cpp
@@ -25,7 +25,7 @@ using namespace llvm::sys;
class DynamicLibrary::HandleSet {
typedef std::vector<void *> HandleList;
HandleList Handles;
- void *Process = nullptr;
+ void *Process = &Invalid;
public:
static void *DLOpen(const char *Filename, std::string *Err);
@@ -58,7 +58,7 @@ public:
Handles.push_back(Handle);
} else {
#ifndef _WIN32
- if (Process) {
+ if (Process != &Invalid) {
if (CanClose)
DLClose(Process);
if (Process == Handle)
@@ -97,11 +97,11 @@ public:
assert(!((Order & SO_LoadedFirst) && (Order & SO_LoadedLast)) &&
"Invalid Ordering");
- if (!Process || (Order & SO_LoadedFirst)) {
+ if (Process == &Invalid || (Order & SO_LoadedFirst)) {
if (void *Ptr = LibLookup(Symbol, Order))
return Ptr;
}
- if (Process) {
+ if (Process != &Invalid) {
// Use OS facilities to search the current binary and all loaded libs.
if (void *Ptr = DLSym(Process, Symbol))
return Ptr;
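The &Invalid sentinel replaces nullptr as the "no process handle" marker, presumably because a real platform handle could legitimately be null; Invalid itself is assumed here to be a static member added in a part of the patch not shown. The pattern in isolation:

    class HandleSet {
      static char Invalid;       // unique address, never dereferenced
      void *Process = &Invalid;  // "unset", without reserving nullptr
    public:
      bool hasProcess() const { return Process != &Invalid; }
    };
    char HandleSet::Invalid;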
diff --git a/llvm/lib/Support/Unix/DynamicLibrary.inc b/llvm/lib/Support/Unix/DynamicLibrary.inc
index 7452913..b6f8e38 100644
--- a/llvm/lib/Support/Unix/DynamicLibrary.inc
+++ b/llvm/lib/Support/Unix/DynamicLibrary.inc
@@ -17,7 +17,7 @@ DynamicLibrary::HandleSet::~HandleSet() {
// Close the libraries in reverse order.
for (void *Handle : llvm::reverse(Handles))
::dlclose(Handle);
- if (Process)
+ if (Process != &Invalid)
::dlclose(Process);
// llvm_shutdown called, Return to default
diff --git a/llvm/lib/Support/Windows/DynamicLibrary.inc b/llvm/lib/Support/Windows/DynamicLibrary.inc
index c434bd6..4f8c96e 100644
--- a/llvm/lib/Support/Windows/DynamicLibrary.inc
+++ b/llvm/lib/Support/Windows/DynamicLibrary.inc
@@ -26,7 +26,7 @@ DynamicLibrary::HandleSet::~HandleSet() {
FreeLibrary(HMODULE(Handle));
// 'Process' should not be released on Windows.
- assert((!Process || Process == this) && "Bad Handle");
+ assert((Process == &Invalid || Process == this) && "Bad Handle");
// llvm_shutdown called, Return to default
DynamicLibrary::SearchOrder = DynamicLibrary::SO_Linker;
}
@@ -60,7 +60,7 @@ static DynamicLibrary::HandleSet *IsOpenedHandlesInstance(void *Handle) {
void DynamicLibrary::HandleSet::DLClose(void *Handle) {
if (HandleSet *HS = IsOpenedHandlesInstance(Handle))
- HS->Process = nullptr; // Just drop the *Process* handle.
+ HS->Process = &Invalid; // Just drop the *Process* handle.
else
FreeLibrary((HMODULE)Handle);
}
@@ -89,7 +89,7 @@ void *DynamicLibrary::HandleSet::DLSym(void *Handle, const char *Symbol) {
return (void *)uintptr_t(GetProcAddress((HMODULE)Handle, Symbol));
// Could have done a dlclose on the *Process* handle
- if (!HS->Process)
+ if (HS->Process == &Invalid)
return nullptr;
  // Trials indicate EnumProcessModulesEx is consistently faster than using
diff --git a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 08a6fa2..c85adcf 100644
--- a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -73,11 +73,11 @@ private:
bool isProfitableToTransform(const MachineInstr &MI) const;
// transformInstruction - Perform the transformation of an instruction
- // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+ // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void transformInstruction(MachineInstr &MI);
- // processMachineBasicBlock - Main optimzation loop.
+ // processMachineBasicBlock - Main optimization loop.
bool processMachineBasicBlock(MachineBasicBlock *MBB);
public:
@@ -231,7 +231,7 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
// If any of the uses of the original instructions is a cross class copy,
// that's a copy that will be removable if we transform. Likewise, if
- // any of the uses is a transformable instruction, it's likely the tranforms
+ // any of the uses is a transformable instruction, it's likely the transforms
// will chain, enabling us to save a copy there, too. This is an aggressive
// heuristic that approximates the graph based cost analysis described above.
Register Dst = MI.getOperand(0).getReg();
@@ -280,7 +280,7 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
}
// transformInstruction - Perform the transformation of an instruction
-// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);
@@ -372,7 +372,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
++NumScalarInsnsUsed;
}
-// processMachineBasicBlock - Main optimzation loop.
+// processMachineBasicBlock - Main optimization loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
bool Changed = false;
for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 0d019bd..3f92c1d 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -467,7 +467,7 @@ void AArch64AsmPrinter::emitAttributes(unsigned Flags,
PAuthABIVersion = (uint64_t(-1) == PAuthABIVersion) ? 0 : PAuthABIVersion;
if (PAuthABIPlatform || PAuthABIVersion) {
- TS->emitAtributesSubsection(
+ TS->emitAttributesSubsection(
AArch64BuildAttributes::getVendorName(
AArch64BuildAttributes::AEABI_PAUTHABI),
AArch64BuildAttributes::SubsectionOptional::REQUIRED,
@@ -490,7 +490,7 @@ void AArch64AsmPrinter::emitAttributes(unsigned Flags,
(Flags & AArch64BuildAttributes::Feature_GCS_Flag) ? 1 : 0;
if (BTIValue || PACValue || GCSValue) {
- TS->emitAtributesSubsection(
+ TS->emitAttributesSubsection(
AArch64BuildAttributes::getVendorName(
AArch64BuildAttributes::AEABI_FEATURE_AND_BITS),
AArch64BuildAttributes::SubsectionOptional::OPTIONAL,
@@ -3531,7 +3531,7 @@ const MCExpr *AArch64AsmPrinter::lowerConstant(const Constant *CV,
char AArch64AsmPrinter::ID = 0;
INITIALIZE_PASS(AArch64AsmPrinter, "aarch64-asm-printer",
- "AArch64 Assmebly Printer", false, false)
+ "AArch64 Assembly Printer", false, false)
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() {
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 64f21c4..53e8e43 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -232,7 +232,7 @@ static bool isCandidateLoad(const MachineInstr &MI) {
}
}
-/// Check whether the given instruction can load a litteral.
+/// Check whether the given instruction can load a literal.
static bool supportLoadFromLiteral(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
@@ -247,7 +247,7 @@ static bool supportLoadFromLiteral(const MachineInstr &MI) {
}
}
-/// Number of GPR registers traked by mapRegToGPRIndex()
+/// Number of GPR registers tracked by mapRegToGPRIndex()
static const unsigned N_GPR_REGS = 31;
/// Map register number to index from 0-30.
static int mapRegToGPRIndex(MCRegister Reg) {
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index f84e838..571e269 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -172,8 +172,23 @@ def form_duplane : GICombineRule <
(apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
>;
-def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev,
- form_duplane, shuf_to_ins]>;
+// Clean up G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16
+class unmerge_duplane<Instruction Op> : GICombineRule <
+ (defs root:$root),
+ (match (Op $a, $src, $c),
+ (G_UNMERGE_VALUES $d1, $d2, $a):$root,
+ [{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
+ (apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
+>;
+def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
+def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
+def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
+// G_DUPLANE64 is not included as the result is scalar.
+def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16,
+ unmerge_duplane32]>;
+
+def shuffle_vector_lowering : GICombineGroup<[dup, form_duplane, rev, ext, zip,
+ uzp, trn, fullrev, shuf_to_ins]>;
// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
def vector_unmerge_lowering : GICombineRule <
@@ -325,7 +340,8 @@ def AArch64PostLegalizerLowering
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mulv2s64,
- vector_unmerge_lowering, insertelt_nonconst]> {
+ vector_unmerge_lowering, insertelt_nonconst,
+ unmerge_duplanes]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 9b59ee6..484bc2a 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -573,7 +573,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// Update the CFG first.
updateTailPHIs();
- // Save successor probabilties before removing CmpBB and Tail from their
+ // Save successor probabilities before removing CmpBB and Tail from their
// parents.
BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB);
BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail);
@@ -581,7 +581,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
Head->removeSuccessor(CmpBB);
CmpBB->removeSuccessor(Tail);
- // If Head and CmpBB had successor probabilties, udpate the probabilities to
+ // If Head and CmpBB had successor probabilities, update the probabilities to
// reflect the ccmp-conversion.
if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) {
@@ -596,7 +596,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
Head2Tail + Head2CmpBB * CmpBB2Tail);
// We will transfer successors of CmpBB to Head in a moment without
- // normalizing the successor probabilities. Set the successor probabilites
+ // normalizing the successor probabilities. Set the successor probabilities
// before doing so.
//
// Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB).
diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 71284b0..987dfbc 100644
--- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -64,10 +64,10 @@ static bool usesFrameIndex(const MachineInstr &MI) {
return false;
}
-// Instructions that lose their 'read' operation for a subesquent fence acquire
+// Instructions that lose their 'read' operation for a subsequent fence acquire
// (DMB LD) once the zero register is used.
//
-// WARNING: The aquire variants of the instructions are also affected, but they
+// WARNING: The acquire variants of the instructions are also affected, but they
// are split out into `atomicBarrierDroppedOnZero()` to support annotations on
// assembly.
static bool atomicReadDroppedOnZero(unsigned Opcode) {
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index bb7e6b6..9d74bb5 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -508,7 +508,7 @@ Register AArch64FastISel::materializeGV(const GlobalValue *GV) {
// also uses BuildMI for making an ADRP (+ MOVK) + ADD, but the operands
// are not exactly 1:1 with FastISel so we cannot easily abstract this
// out. At some point, it would be nice to find a way to not have this
- // duplciate code.
+ // duplicate code.
Register DstReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AArch64::MOVKXi),
DstReg)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 2268323..3335ee0 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -399,7 +399,7 @@ static const unsigned DefaultSafeSPDisplacement = 255;
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
- // FIXME: For now, just conservatively guestimate based on unscaled indexing
+ // FIXME: For now, just conservatively guesstimate based on unscaled indexing
// range. We'll end up allocating an unnecessary spill slot a lot, but
// realistically that's not a big deal at this stage of the game.
for (MachineBasicBlock &MBB : MF) {
@@ -647,7 +647,7 @@ void AArch64FrameLowering::emitCalleeSavedSVELocations(
continue;
// Not all unwinders may know about SVE registers, so assume the lowest
- // common demoninator.
+ // common denominator.
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
MCRegister Reg = Info.getReg();
if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
@@ -801,7 +801,7 @@ void AArch64FrameLowering::allocateStackSpace(
.addImm(InitialOffset.getFixed())
.addImm(InitialOffset.getScalable());
// The fixed allocation may leave unprobed bytes at the top of the
- // stack. If we have subsequent alocation (e.g. if we have variable-sized
+ // stack. If we have subsequent allocation (e.g. if we have variable-sized
// objects), we need to issue an extra probe, so these allocations start in
// a known state.
if (FollowupAllocs) {
@@ -2054,7 +2054,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
HasWinCFI = true;
// alloc_l can hold at most 256MB, so assume that NumBytes doesn't
// exceed this amount. We need to move at most 2^24 - 1 into x15.
- // This is at most two instructions, MOVZ follwed by MOVK.
+ // This is at most two instructions, MOVZ followed by MOVK.
// TODO: Fix to use multiple stack alloc unwind codes for stacks
// exceeding 256MB in size.
if (NumBytes >= (1 << 28))
@@ -2400,7 +2400,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineInstr::FrameDestroy, PrologueSaveSize);
} else {
// If not, make sure to emit an add after the last ldp.
- // We're doing this by transfering the size to be restored from the
+ // We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
AfterCSRPopSize += PrologueSaveSize;
@@ -2949,7 +2949,7 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
const TargetRegisterInfo *TRI) {
// If we are generating register pairs for a Windows function that requires
// EH support, then pair consecutive registers only. There are no unwind
- // opcodes for saves/restores of non-consectuve register pairs.
+ // opcodes for saves/restores of non-consecutive register pairs.
  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
// save_lrpair.
// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
@@ -3187,7 +3187,7 @@ static void computeCalleeSaveRegisterPairs(
RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
- // Realign the scalable offset if necesary. This is relevant when
+ // Realign the scalable offset if necessary. This is relevant when
// spilling predicates on Windows.
if (RPI.isScalable() && ScalableByteOffset % Scale != 0) {
ScalableByteOffset = alignTo(ScalableByteOffset, Scale);
@@ -5022,7 +5022,7 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
}
// Find contiguous runs of tagged memory and emit shorter instruction
- // sequencies for them when possible.
+ // sequences for them when possible.
TagStoreEdit TSE(MBB, FirstZeroData);
std::optional<int64_t> EndOffset;
for (auto &Instr : Instrs) {
@@ -5591,7 +5591,7 @@ void AArch64FrameLowering::emitRemarks(
unsigned RegTy = StackAccess::AccessType::GPR;
if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
// SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
- // spill/fill the predicate as a data vector (so are an FPR acess).
+ // spill/fill the predicate as a data vector (so are an FPR access).
if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 34f6db9..11cb91f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -991,7 +991,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
}
/// SelectArithUXTXRegister - Select a "UXTX register" operand. This
-/// operand is refered by the instructions have SP operand
+/// operand is referred to by instructions that have an SP operand.
bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg,
SDValue &Shift) {
unsigned ShiftVal = 0;
@@ -2841,7 +2841,7 @@ static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
// After #1, x useful bits are 0x7, then the useful bits of x, live through
// y.
// After #2, the useful bits of x are 0x4.
-// However, if x is used on an unpredicatable instruction, then all its bits
+// However, if x is used on an unpredictable instruction, then all its bits
// are useful.
// E.g.
// 1. y = x & 0x7
@@ -3611,7 +3611,7 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
DstLSB = 0;
Width = ImmS - ImmR + 1;
// FIXME: This constraint is to catch bitfield insertion we may
- // want to widen the pattern if we want to grab general bitfied
+ // want to widen the pattern if we want to grab general bitfield
// move case
if (Width <= 0)
continue;
@@ -3999,7 +3999,7 @@ static int getIntOperandFromRegisterString(StringRef RegString) {
// Lower the read_register intrinsic to an MRS instruction node if the special
// register string argument is either of the form detailed in the ALCE (the
-// form described in getIntOperandsFromRegsterString) or is a named register
+// form described in getIntOperandFromRegisterString) or is a named register
// known by the MRS SysReg mapper.
bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
@@ -4060,7 +4060,7 @@ bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
// Lower the write_register intrinsic to an MSR instruction node if the special
// register string argument is either of the form detailed in the ALCE (the
-// form described in getIntOperandsFromRegsterString) or is a named register
+// form described in getIntOperandFromRegisterString) or is a named register
// known by the MSR SysReg mapper.
bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
@@ -7278,7 +7278,7 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT,
}
/// Return the EVT of the data associated to a memory operation in \p
-/// Root. If such EVT cannot be retrived, it returns an invalid EVT.
+/// Root. If such EVT cannot be retrieved, it returns an invalid EVT.
static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(Root))
return MemIntr->getMemoryVT();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 379f07e..d30cfa2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5367,7 +5367,7 @@ static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
return AArch64ISD::UMULL;
} else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
DAG.MaskedValueIsZero(N1, Mask)) {
- // For v2i64 we look more aggresively at both operands being zero, to avoid
+ // For v2i64 we look more aggressively at both operands being zero, to avoid
// scalarization.
return AArch64ISD::UMULL;
}
@@ -5844,7 +5844,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
} else {
- report_fatal_error("Unexpected type for AArch64 NEON intrinic");
+ report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
}
}
case Intrinsic::aarch64_neon_pmull64: {
@@ -5870,19 +5870,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// If the operand is an higher half itself, rewrite it to
// extract_high_v2i64; this way aarch64_neon_pmull64 could
// re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
- if (NLane && *NLane == 1)
+ if (NLane == 1)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
// Operand N is not a higher half but the other operand is.
- if (OtherLane && *OtherLane == 1) {
+ if (OtherLane == 1) {
// If this operand is a lower half, rewrite it to
// extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
// align lanes of two operands. A roundtrip sequence (to move from lane
// 1 to lane 0) is like this:
// mov x8, v0.d[1]
// fmov d0, x8
- if (NLane && *NLane == 0)
+ if (NLane == 0)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
N.getOperand(0),
@@ -8630,9 +8630,9 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
if (SizeInBits < 8)
return false;
- APInt RequredZero(SizeInBits, 0xFE);
+ APInt RequiredZero(SizeInBits, 0xFE);
KnownBits Bits = DAG.computeKnownBits(Arg, 4);
- bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero;
+ bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
return ZExtBool;
}
@@ -13536,7 +13536,7 @@ static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
} else {
assert(VT.getScalarSizeInBits() == 32 &&
- "Expected 16 or 32 bit shuffle elemements");
+ "Expected 16 or 32 bit shuffle elements");
Input = DAG.getBitcast(MVT::v2f64, Input);
OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
}
@@ -13941,7 +13941,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
V1 = DAG.getBitcast(NewVecTy, V1);
- // Constuct the DUP instruction
+ // Construct the DUP instruction
V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
// Cast back to the original type
return DAG.getBitcast(VT, V1);
@@ -16900,12 +16900,12 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
- Align &RequiredAligment) const {
+ Align &RequiredAlignment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
- RequiredAligment = Align(1);
+ RequiredAlignment = Align(1);
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
@@ -18028,7 +18028,7 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
return SDValue();
- // Pattern is dectected. Let's convert it to sequence of nodes.
+ // Pattern is detected. Let's convert it to sequence of nodes.
SDLoc DL(N);
// First, create the node pattern of UABD/SABD.
@@ -18246,10 +18246,10 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(I * 16, DL, MVT::i64));
SDValue Dot =
DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
- SDValue VecReudceAdd8 =
+ SDValue VecReduceAdd8 =
DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
- VecReudceAdd8);
+ VecReduceAdd8);
}
// Given an (integer) vecreduce, we know the order of the inputs does not
@@ -21474,7 +21474,7 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
case Intrinsic::aarch64_neon_ushl:
// For positive shift amounts we can use SHL, as ushl/sshl perform a regular
// left shift for positive shift amounts. For negative shifts we can use a
- // VASHR/VLSHR as appropiate.
+ // VASHR/VLSHR as appropriate.
if (ShiftAmount < 0) {
Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
: AArch64ISD::VLSHR;
@@ -22880,7 +22880,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
}
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");
+ assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
// splice(pg, op1, undef) -> op1
if (N->getOperand(2).isUndef())
@@ -23616,10 +23616,10 @@ static SDValue performLOADCombine(SDNode *N,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
SDValue UndefVector = DAG.getUNDEF(NewVT);
SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
- SDValue ExtendedReminingLoad =
+ SDValue ExtendedRemainingLoad =
DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
{UndefVector, RemainingLoad, InsertIdx});
- LoadOps.push_back(ExtendedReminingLoad);
+ LoadOps.push_back(ExtendedRemainingLoad);
LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
EVT ConcatVT =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d76c4ce..e0b6c1b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -207,7 +207,7 @@ public:
bool optimizeExtendOrTruncateConversion(
Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override;
- bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
+ bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 4796c27..9078748 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -282,7 +282,7 @@ def CondCode : AsmOperandClass {
let DiagnosticType = "InvalidCondCode";
}
-// A 32-bit register pasrsed as 64-bit
+// A 32-bit register parsed as 64-bit
def GPR32as64Operand : AsmOperandClass {
let Name = "GPR32as64";
let ParserMethod =
@@ -292,7 +292,7 @@ def GPR32as64 : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32as64Operand;
}
-// A 64-bit register pasrsed as 32-bit
+// A 64-bit register parsed as 32-bit
def GPR64as32Operand : AsmOperandClass {
let Name = "GPR64as32";
let ParserMethod =
@@ -580,7 +580,7 @@ def uimm5s8 : Operand<i64>, ImmLeaf<i64,
let PrintMethod = "printImmScale<8>";
}
-// tuimm5sN predicate - similiar to uimm5sN, but use TImmLeaf (TargetConstant)
+// tuimm5sN predicate - similar to uimm5sN, but use TImmLeaf (TargetConstant)
// instead of ImmLeaf (Constant)
def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
@@ -3776,7 +3776,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, DAGOperand regtype,
// Same as StoreUI, but take a RegisterOperand. This is used by GlobalISel to
// substitute zero-registers automatically.
//
-// TODO: Roll out zero-register subtitution to GPR32/GPR64 and fold this back
+// TODO: Roll out zero-register substitution to GPR32/GPR64 and fold this back
// into StoreUI.
multiclass StoreUIz<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index a229b71..951cb93 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8775,7 +8775,7 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
return false;
// Find Definition.
- assert(MI.getParent() && "Incomplete machine instruciton\n");
+ assert(MI.getParent() && "Incomplete machine instruction\n");
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -9077,7 +9077,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
// address signing attributes, i.e., all share the same value for the
// attribute "sign-return-address" and all share the same type of key they
// are signed with.
- // Additionally we require all functions to simultaniously either support
+ // Additionally we require all functions to simultaneously either support
// v8.3a features or not. Otherwise an outlined function could get signed
// using dedicated v8.3 instructions and a call from a function that doesn't
// support v8.3 instructions would therefore be invalid.
@@ -10319,7 +10319,7 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
Register TargetReg, bool FrameSetup) const {
- assert(TargetReg != AArch64::SP && "New top of stack cannot aleady be in SP");
+ assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
MachineBasicBlock &MBB = *MBBI->getParent();
MachineFunction &MF = *MBB.getParent();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 0ffaca9..7c255da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -519,7 +519,7 @@ public:
/// Returns true if the instruction has a shift by immediate that can be
/// executed in one cycle less.
static bool isFalkorShiftExtFast(const MachineInstr &MI);
- /// Return true if the instructions is a SEH instruciton used for unwinding
+  /// Return true if the instruction is a SEH instruction used for unwinding
/// on Windows.
static bool isSEHInstruction(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 0dbce01..f5b66b7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -685,23 +685,35 @@ defm trunc_masked_scatter_i32 : masked_gather_scatter<trunc_masked_scatter_i32>;
def top16Zero: PatLeaf<(i32 GPR32:$src), [{
return Op.getValueType() == MVT::i32 &&
CurDAG->MaskedValueIsZero(Op, APInt::getHighBitsSet(32, 16));
- }]>;
+ }]> {
+ let GISelLeafPredicateCode = [{
+ return VT && VT->maskedValueIsZero(Reg, APInt::getHighBitsSet(32, 16)); }];
+}
// top32Zero - answer true if the upper 32 bits of $src are 0, false otherwise
def top32Zero: PatLeaf<(i64 GPR64:$src), [{
return Op.getValueType() == MVT::i64 &&
CurDAG->MaskedValueIsZero(Op, APInt::getHighBitsSet(64, 32));
- }]>;
+ }]> {
+ let GISelLeafPredicateCode = [{
+ return VT && VT->maskedValueIsZero(Reg, APInt::getHighBitsSet(64, 32)); }];
+}
// topbitsallzero - Return true if all bits except the lowest bit are known zero
def topbitsallzero32: PatLeaf<(i32 GPR32:$src), [{
return Op.getValueType() == MVT::i32 &&
CurDAG->MaskedValueIsZero(Op, APInt::getHighBitsSet(32, 31));
- }]>;
+ }]> {
+ let GISelLeafPredicateCode = [{
+ return VT && VT->maskedValueIsZero(Reg, APInt::getHighBitsSet(32, 31)); }];
+}
def topbitsallzero64: PatLeaf<(i64 GPR64:$src), [{
return Op.getValueType() == MVT::i64 &&
CurDAG->MaskedValueIsZero(Op, APInt::getHighBitsSet(64, 63));
- }]>;
+ }]> {
+ let GISelLeafPredicateCode = [{
+ return VT && VT->maskedValueIsZero(Reg, APInt::getHighBitsSet(64, 63)); }];
+}
// Node definitions.
// Compare-and-branch
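
The four PatLeaf predicates above now express the same test for both SelectionDAG (MaskedValueIsZero) and GlobalISel (maskedValueIsZero via the new GISelLeafPredicateCode). As a hedged standalone illustration of the masks being tested (plain uint64_t arithmetic standing in for the APInt class):

#include <cassert>
#include <cstdint>

// Stand-in for APInt::getHighBitsSet(BitWidth, HiBits), BitWidth <= 64.
uint64_t highBitsSet(unsigned BitWidth, unsigned HiBits) {
  uint64_t WidthMask = BitWidth == 64 ? ~0ULL : ((1ULL << BitWidth) - 1);
  return (~0ULL << (BitWidth - HiBits)) & WidthMask;
}

int main() {
  assert(highBitsSet(32, 16) == 0xFFFF0000ULL);         // top16Zero
  assert(highBitsSet(64, 32) == 0xFFFFFFFF00000000ULL); // top32Zero
  assert(highBitsSet(32, 31) == 0xFFFFFFFEULL);         // topbitsallzero32
  assert(highBitsSet(64, 63) == 0xFFFFFFFFFFFFFFFEULL); // topbitsallzero64
  return 0;
}
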
@@ -853,7 +865,7 @@ def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
-// Vector immedate moves
+// Vector immediate moves
def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
@@ -1475,7 +1487,7 @@ let Predicates = [HasPCDPHINT] in {
}
// In order to be able to write readable assembly, LLVM should accept assembly
-// inputs that use Branch Target Indentification mnemonics, even with BTI disabled.
+// inputs that use Branch Target Identification mnemonics, even with BTI disabled.
// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
// should not emit these mnemonics unless BTI is enabled.
def : InstAlias<"bti", (HINT 32), 0>;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index b7da07a..f51f0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -10,7 +10,7 @@
// optimizations. This pass should be run after register allocation.
//
// The pass runs after the PrologEpilogInserter where we emit the CFI
-// instructions. In order to preserve the correctness of the unwind informaiton,
+// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad-hoc fix
// to unwind information.
@@ -189,7 +189,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// pre or post indexed addressing with writeback. Scan backwards.
// `MergeEither` is set to true if the combined instruction may be placed
// either at the location of the load/store instruction or at the location of
- // the update intruction.
+ // the update instruction.
MachineBasicBlock::iterator
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
bool &MergeEither);
@@ -1281,7 +1281,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// instruction contains the final value we care about we give it a new
// debug-instr-number 3. Whereas, $w1 contains the final value that we care
// about, therefore the LDP instruction is also given a new
- // debug-instr-number 4. We have to add these subsitutions to the
+ // debug-instr-number 4. We have to add these substitutions to the
// debugValueSubstitutions table. However, we also have to ensure that the
// OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
// $w1 is the second operand of the LDP instruction.
@@ -2602,7 +2602,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
ModifiedRegUnits.clear();
UsedRegUnits.clear();
unsigned Count = 0;
- bool MemAcessBeforeSPPreInc = false;
+ bool MemAccessBeforeSPPreInc = false;
MergeEither = true;
do {
MBBI = prev_nodbg(MBBI, B);
@@ -2617,7 +2617,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
// Check that the update value is within our red zone limit (which may be
// zero).
- if (MemAcessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
+ if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
return E;
return MBBI;
}
@@ -2648,7 +2648,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
// case we need to validate later that the update amount respects the red
// zone.
if (BaseRegSP && MBBI->mayLoadOrStore())
- MemAcessBeforeSPPreInc = true;
+ MemAccessBeforeSPPreInc = true;
} while (MBBI != B && Count < Limit);
return E;
}
@@ -2745,7 +2745,7 @@ bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
if (!TII->isCandidateToMergeOrPair(MI))
return false;
- // Look ahead up to LdStLimit instructions for a mergable instruction.
+ // Look ahead up to LdStLimit instructions for a mergeable instruction.
LdStPairFlags Flags;
MachineBasicBlock::iterator MergeMI =
findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
@@ -2941,7 +2941,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
bool Modified = false;
- // Four tranformations to do here:
+ // Four transformations to do here:
// 1) Find loads that directly read from stores and promote them by
// replacing with mov instructions. If the store is wider than the load,
// the load will be replaced with a bitfield extract.
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
index 66f14b6..d67182d 100644
--- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -402,7 +402,7 @@ static bool shouldUseFrameHelper(MachineBasicBlock &MBB,
InstCount--;
break;
case FrameHelperType::PrologFrame: {
- // Effecitvely no change in InstCount since FpAdjusment is included.
+ // Effectively no change in InstCount since FpAdjustment is included.
break;
}
case FrameHelperType::Epilog:
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index e7a35272..c7ea639 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -302,7 +302,7 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureUseFixedOverScalableIfEqualCost]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
-// perform the fusion and cyclone is used by default when targetting apple OSes.
+// perform the fusion and cyclone is used by default when targeting apple OSes.
def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
"Apple A7 (the CPU formerly known as Cyclone)", [
FeatureAlternateSExtLoadCVTF32Pattern,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index b0c69b8..fb472dd 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1218,7 +1218,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
// is valid but { z1, z2, z3, z5 } is not.
// * One or more of the registers used by FORM_TRANSPOSED_X4 is already
// assigned a physical register, which means only checking that a
- // consectutive range of free tuple registers exists which includes
+ // consecutive range of free tuple registers exists which includes
// the assigned register.
// e.g. in the example above, if { z0, z8 } is already allocated for
// %v0, we just need to ensure that { z1, z9 }, { z2, z10 } and
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index d3252ea..61bf87f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1849,7 +1849,7 @@ def ZTR : RegisterClass<"AArch64", [untyped], 512, (add ZT0)> {
// * Tile vectors:
//
// Their representation is similar to regular tiles, but they have an extra
-// 'h' or 'v' to tell how the vector at [reg+offset] is layed out in the tile,
+// 'h' or 'v' to tell how the vector at [reg+offset] is laid out in the tile,
// horizontally or vertically.
//
// e.g. za1h.h or za15v.q, which corresponds to vectors in registers ZAH1 and
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index b3159b4..d695f26 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -147,7 +147,7 @@ struct AArch64SIMDInstrOpt : public MachineFunctionPass {
};
// A costly instruction is replaced in this work by N efficient instructions
- // The maximum of N is curently 10 and it is for ST4 case.
+ // The maximum of N is currently 10 and it is for ST4 case.
static const unsigned MaxNumRepl = 10;
AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {}
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
index 91410a5..0dc57f7 100644
--- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -449,7 +449,7 @@ void SLSHardeningInserter::convertBLRToBL(
// Now copy the implicit operands from BLR to BL and copy other necessary
// info.
- // However, both BLR and BL instructions implictly use SP and implicitly
+ // However, both BLR and BL instructions implicitly use SP and implicitly
// define LR. Blindly copying implicit operands would result in SP and LR
// operands being present multiple times. While this may not be too much of
// an issue, let's avoid that for cleanliness, by removing those implicit
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 91db6b6..12da015 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4064,8 +4064,10 @@ let Predicates = [HasSVE2_or_SME] in {
let Predicates = [HasSVEAES, HasNonStreamingSVE2_or_SSVE_AES] in {
// SVE2 crypto destructive binary operations
- defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
- defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
+ let isCommutable = 1 in {
+ def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
+ def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
+ }
// SVE2 crypto unary operations
defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>;
@@ -4082,7 +4084,7 @@ let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
// SVE2 crypto destructive binary operations
- defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
+ def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
} // End HasSVE2SM4
let Predicates = [HasSVE2SHA3] in {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index c714bad..66715b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -19,7 +19,7 @@ def CortexA53Model : SchedMachineModel {
let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
let LoadLatency = 3; // Optimistic load latency assuming bypass.
- // This is overriden by OperandCycles if the
+ // This is overridden by OperandCycles if the
// Itineraries are queried instead.
let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
// Specification - Instruction Timings"
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 39f7077..8d3a455 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -14,7 +14,9 @@
//===----------------------------------------------------------------------===//
def NeoverseV2Model : SchedMachineModel {
- let IssueWidth = 16; // Micro-ops dispatched at a time.
+  let IssueWidth            =   6; // This value comes from the decode bandwidth,
+ // and empirical measurements showed that a
+ // lower value is better.
let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
index 09d1af2..5b597b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedOryon.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
@@ -187,7 +187,7 @@ def ORYONFP2 : ProcResGroup<[ORYONP14FP2]> {
let BufferSize = 48;
}
-// Reciprocal, Squre root on FP0.
+// Reciprocal, Square root on FP0.
def ORYONFP0 : ProcResGroup<[ORYONP12FP0]> {
let BufferSize = 48;
}
@@ -701,7 +701,7 @@ def : InstRW<[ORYONWrite_1Cyc_I0123],
"^CSNEG(W|X)r", "^CSINC(W|X)r")>;
//---
-//Compare Instruciton
+//Compare Instruction
//---
// We have CCMP, CCMN as LLVM DAG node
@@ -1512,7 +1512,7 @@ def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTv.*32$")>;
def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTv.*64$")>;
//==========
-// SIMD binary elememt arithmetic instructions
+// SIMD binary element arithmetic instructions
//==========
def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMLAv", "^FMLSv")>;
@@ -1568,7 +1568,7 @@ def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDPv", "^FADDPv",
"^(FMAX|FMIN)(NMP|P)v",
"^(S|U)(MIN|MAX)Pv")>;
//==========
-// SIMD dot prodcut instructions
+// SIMD dot product instructions
//==========
def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(U|S)DOTv")>;
@@ -1581,7 +1581,7 @@ def : InstRW<[ORYONWrite_2Cyc_FP0123], (instrs TBLv8i8One, TBLv16i8One,
TBXv8i8One, TBXv16i8One,
TBLv8i8Two, TBLv16i8Two)>;
-// TBL 3-reg/4-reg, 3uops, throughtput=4/3=1.33 latency=4
+// TBL 3-reg/4-reg, 3uops, throughput=4/3=1.33 latency=4
def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC],
(instrs TBLv8i8Three, TBLv16i8Three,
TBLv8i8Four, TBLv16i8Four)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
index 96707f2..a591ba9 100644
--- a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -13,7 +13,7 @@
// register. That taint register can then be used to mask off registers with
// sensitive data when executing under miss-speculation, a.k.a. "transient
// execution".
-// This pass is aimed at mitigating against SpectreV1-style vulnarabilities.
+// This pass is aimed at mitigating against SpectreV1-style vulnerabilities.
//
// It also implements speculative load hardening, i.e. using the taint register
// to automatically mask off loaded data.
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index f5ffc72..f95b0fa 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/TargetParser/Triple.h"
#define GET_SUBTARGETINFO_HEADER
#include "AArch64GenSubtargetInfo.inc"
@@ -33,7 +34,6 @@
namespace llvm {
class GlobalValue;
class StringRef;
-class Triple;
class AArch64Subtarget final : public AArch64GenSubtargetInfo {
public:
@@ -406,7 +406,7 @@ public:
}
// Return the known bit length of SVE data registers. A value of 0 means the
- // length is unkown beyond what's implied by the architecture.
+ // length is unknown beyond what's implied by the architecture.
unsigned getSVEVectorSizeInBits() const {
assert(isSVEorStreamingSVEAvailable() &&
"Tried to get SVE vector length without SVE support!");
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 8f6c593..1f3d619 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -941,7 +941,7 @@ defm : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>;
defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
} //FeatureTLB_RMI
-// Armv9-A Realm Management Extention TLBI Instructions
+// Armv9-A Realm Management Extension TLBI Instructions
let Requires = ["AArch64::FeatureRME"] in {
defm : TLBI<"RPAOS", 0b110, 0b1000, 0b0100, 0b011>;
defm : TLBI<"RPALOS", 0b110, 0b1000, 0b0100, 0b111>;
@@ -1696,7 +1696,7 @@ def : RWSysReg<"PRLAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b001>;
foreach n = 1-15 in {
foreach x = 1-2 in {
-//Direct acces to Protection Region Base Address Register for n th MPU region
+//Direct access to Protection Region Base Address Register for n th MPU region
def : RWSysReg<!strconcat("PRBAR"#n, "_EL"#x),
0b11, 0b000, 0b0110, 0b1000, 0b000>{
let Encoding{5-2} = n;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 68aec80..f8013ac 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1167,7 +1167,7 @@ struct SVEIntrinsicInfo {
}
// NOTE: Whilst not limited to only inactive lanes, the common use case is:
- // inactiveLanesAreZerod =
+ // inactiveLanesAreZeroed =
// resultIsZeroInitialized() && inactiveLanesAreUnused()
bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
@@ -3958,7 +3958,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
scalarization of the division operation.
2. Constant divisors, either negative in whole or partially, don't result in
significantly different codegen as compared to positive constant divisors.
- So, we don't consider negative divisors seperately.
+ So, we don't consider negative divisors separately.
3. If the codegen is significantly different with SVE, it has been indicated
using comments at appropriate places.
@@ -3980,7 +3980,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
other sdiv/srem cases:
-------------------------------------------------------------------------
- commom codegen | + srem | + sdiv | pow-of-2 | Type
+ common codegen | + srem | + sdiv | pow-of-2 | Type
-------------------------------------------------------------------------
smulh + asr + add + add | - | - | N | i64
smull + lsr + add + add | - | - | N | i32
@@ -5921,7 +5921,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
!match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
return false;
- // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that
+ // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
// it is not checked as an extract below.
if (AllowSplat && isSplatShuffle(Op1))
S1Op1 = nullptr;
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index ce75c05..2f67ff5 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1275,7 +1275,7 @@ public:
RK = RegKind::SVEPredicateAsCounter;
break;
default:
- llvm_unreachable("Unsupport register class");
+ llvm_unreachable("Unsupported register class");
}
return (Kind == k_Register && Reg.Kind == RK) &&
@@ -1302,7 +1302,7 @@ public:
RK = RegKind::SVEPredicateVector;
break;
default:
- llvm_unreachable("Unsupport register class");
+ llvm_unreachable("Unsupported register class");
}
return (Kind == k_Register && Reg.Kind == RK) &&
@@ -5405,7 +5405,7 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
// A prefix only applies to the instruction following it. Here we extract
// prefix information for the next instruction before validating the current
- // one so that in the case of failure we don't erronously continue using the
+ // one so that in the case of failure we don't erroneously continue using the
// current prefix.
PrefixInfo Prefix = NextPrefix;
NextPrefix = PrefixInfo::CreateFromInst(Inst, MCID.TSFlags);
@@ -5417,7 +5417,7 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
(Inst.getOpcode() != AArch64::BRK) &&
(Inst.getOpcode() != AArch64::HLT)) {
- // Prefixed intructions must have a destructive operand.
+ // Prefixed instructions must have a destructive operand.
if ((MCID.TSFlags & AArch64::DestructiveInstTypeMask) ==
AArch64::NotDestructive)
return Error(IDLoc, "instruction is unpredictable when following a"
@@ -6407,7 +6407,7 @@ bool AArch64AsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
- assert(!Operands.empty() && "Unexpect empty operand list!");
+ assert(!Operands.empty() && "Unexpected empty operand list!");
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
assert(Op.isToken() && "Leading operand should always be a mnemonic!");
@@ -7942,7 +7942,7 @@ bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) {
}
std::unique_ptr<MCELFStreamer::AttributeSubSection> SubsectionExists =
- getTargetStreamer().getAtributesSubsectionByName(SubsectionName);
+ getTargetStreamer().getAttributesSubsectionByName(SubsectionName);
// Consume the first parameter (optionality parameter)
AArch64BuildAttributes::SubsectionOptional IsOptional;
@@ -8038,7 +8038,7 @@ bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) {
return true;
}
- getTargetStreamer().emitAtributesSubsection(SubsectionName, IsOptional, Type);
+ getTargetStreamer().emitAttributesSubsection(SubsectionName, IsOptional, Type);
return false;
}
@@ -8050,7 +8050,7 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) {
MCAsmParser &Parser = getParser();
std::unique_ptr<MCELFStreamer::AttributeSubSection> ActiveSubsection =
- getTargetStreamer().getActiveAtributesSubsection();
+ getTargetStreamer().getActiveAttributesSubsection();
if (nullptr == ActiveSubsection) {
Error(Parser.getTok().getLoc(),
"no active subsection, build attribute can not be added");
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 3f22292..f2528bc 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -41,12 +41,12 @@ getMachOSpecifier(uint64_t LLVMDisassembler_VariantKind) {
}
}
-/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
/// operand in place of the immediate Value in the MCInst. The immediate
/// Value has not had any PC adjustment made by the caller. If the instruction
/// is a branch that adds the PC to the immediate Value then isBranch is
/// Success, else Fail. If GetOpInfo is non-null, then it is called to get any
-/// symbolic information at the Address for this instrution. If that returns
+/// symbolic information at the Address for this instruction. If that returns
/// non-zero then the symbolic information it returns is used to create an
/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo()
/// returns zero and isBranch is Success then a symbol look up for
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index fd41497..010d0aaa 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -282,7 +282,7 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
/// We need to fixup the reported store size for certain value types because
/// we invert the interpretation of ValVT and LocVT in certain cases. This is
- /// for compatability with the DAG call lowering implementation, which we're
+ /// for compatibility with the DAG call lowering implementation, which we're
/// currently building on top of.
LLT getStackValueStoreType(const DataLayout &DL, const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const override {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index e0c693b..51b4232 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -22,6 +22,7 @@
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
+#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -6630,7 +6631,7 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
Register SizeUse = I.getOperand(4).getReg();
// MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
- // Therefore an additional virtual register is requried for the updated size
+ // Therefore an additional virtual register is required for the updated size
// operand. This value is not accessible via the semantics of the intrinsic.
Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
@@ -7418,7 +7419,7 @@ AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
unsigned Scale = Log2_32(SizeInBytes);
int64_t ImmOff = ValAndVReg->Value.getSExtValue();
- // Skip immediates that can be selected in the load/store addresing
+ // Skip immediates that can be selected in the load/store addressing
// mode.
if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
ImmOff < (0x1000 << Scale))
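
A hedged worked example (toy helpers, not the selector's API) of the range check above: a load/store of SizeInBytes can encode an unsigned 12-bit offset scaled by the access size, so immediates that are non-negative, size-aligned, and below 0x1000 << log2(size) are better served by that form than by the register-offset (XRO) mode being selected here.

#include <cassert>
#include <cstdint>

// Stand-in for Log2_32 on a power-of-two access size.
unsigned log2u(unsigned V) {
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

// Mirrors the guard above: true when ImmOff fits the reg + uimm12
// scaled addressing mode, so the XRO form should be skipped.
bool fitsScaledUImm12(int64_t ImmOff, unsigned SizeInBytes) {
  unsigned Scale = log2u(SizeInBytes);
  return ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
         ImmOff < (int64_t(0x1000) << Scale);
}

int main() {
  assert(fitsScaledUImm12(32760, 8));  // 4095 * 8: last valid offset
  assert(!fitsScaledUImm12(32768, 8)); // 4096 * 8: out of range
  assert(!fitsScaledUImm12(12, 8));    // not a multiple of the size
  return 0;
}
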
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 80e098e..53c7a00 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -399,6 +399,26 @@ void AArch64RegisterBankInfo::applyMappingImpl(
MI.getOperand(1).setReg(ConstReg);
return applyDefaultMapping(OpdMapper);
}
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+        // SDAG will promote a 64-bit G_EXTRACT_VECTOR_ELT to 128 bits to reduce the
+ // number of duplicate lane-extract patterns needed. Do the same here so
+ // that selection will operate on the larger vectors.
+ Register Src = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(Src);
+ assert(SrcTy.getSizeInBits() == 64 && "Expected 64-bit source vector");
+ LLT DstTy = SrcTy.multiplyElements(2);
+ Builder.setInsertPt(*MI.getParent(), MI.getIterator());
+ auto Undef = Builder.buildUndef(SrcTy);
+ auto Concat = Builder.buildConcatVectors(DstTy, {Src, Undef.getReg(0)});
+ MRI.setRegBank(Undef.getReg(0), getRegBank(AArch64::FPRRegBankID));
+ MRI.setRegBank(Concat.getReg(0), getRegBank(AArch64::FPRRegBankID));
+ for (MachineInstr &Ext :
+ make_early_inc_range(MRI.use_nodbg_instructions(Src))) {
+ if (Ext.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT)
+ Ext.getOperand(1).setReg(Concat.getReg(0));
+ }
+ return applyDefaultMapping(OpdMapper);
+ }
default:
llvm_unreachable("Don't know how to handle that operation");
}
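
A hedged scalar analogy (toy arrays, not GlobalISel MIR) for the promotion added above: the 64-bit source vector is concatenated with an undef upper half into a 128-bit view, and in-range lane extracts read the same values through the wider type, which is why retargeting the other G_EXTRACT_VECTOR_ELT users to the concat is safe.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Small[2] = {7, 9};             // the 64-bit <2 x s32> source
  uint32_t Undef = 0xDEADBEEF;            // stand-in for undef lanes
  uint32_t Wide[4] = {Small[0], Small[1], // G_CONCAT_VECTORS(Small, undef)
                      Undef, Undef};
  for (unsigned Lane = 0; Lane < 2; ++Lane)
    assert(Wide[Lane] == Small[Lane]);    // in-range extracts unchanged
  return 0;
}
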
@@ -490,7 +510,7 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
}
}
-bool AArch64RegisterBankInfo::isPHIWithFPContraints(
+bool AArch64RegisterBankInfo::isPHIWithFPConstraints(
const MachineInstr &MI, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI, const unsigned Depth) const {
if (!MI.isPHI() || Depth > MaxFPRSearchDepth)
@@ -500,7 +520,7 @@ bool AArch64RegisterBankInfo::isPHIWithFPContraints(
[&](const MachineInstr &UseMI) {
if (onlyUsesFP(UseMI, MRI, TRI, Depth + 1))
return true;
- return isPHIWithFPContraints(UseMI, MRI, TRI, Depth + 1);
+ return isPHIWithFPConstraints(UseMI, MRI, TRI, Depth + 1);
});
}
@@ -897,7 +917,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Int->FP conversion operations are also captured in
// onlyDefinesFP().
- if (isPHIWithFPContraints(UseMI, MRI, TRI))
+ if (isPHIWithFPConstraints(UseMI, MRI, TRI))
return true;
return onlyUsesFP(UseMI, MRI, TRI) ||
@@ -1014,14 +1034,20 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
- case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
// Destination and source need to be FPRs.
OpRegBankIdx[0] = PMI_FirstFPR;
OpRegBankIdx[1] = PMI_FirstFPR;
-
- // Index needs to be a GPR.
+ // Index needs to be a GPR constant.
OpRegBankIdx[2] = PMI_FirstGPR;
+    // SDAG will promote a 64-bit G_EXTRACT_VECTOR_ELT to 128 bits to reduce the
+ // number of duplicate lane-extract patterns needed. Do the same here so
+ // that selection will operate on the larger vectors.
+ LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+ if (!Ty.isScalable() && Ty.getSizeInBits() == 64)
+ MappingID = CustomMappingID;
break;
+ }
case TargetOpcode::G_INSERT_VECTOR_ELT:
OpRegBankIdx[0] = PMI_FirstFPR;
OpRegBankIdx[1] = PMI_FirstFPR;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index 941499b..3abbc1b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -122,7 +122,7 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
/// \returns true if \p MI is a PHI whose def is used by
/// any instruction that onlyUsesFP.
- bool isPHIWithFPContraints(const MachineInstr &MI,
+ bool isPHIWithFPConstraints(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
unsigned Depth = 0) const;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 03cbd27..f542592 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -97,7 +97,7 @@ static inline unsigned getShiftValue(unsigned Imm) {
/// {5-0} = imm
static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
unsigned Imm) {
- assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
+ assert((Imm & 0x3f) == Imm && "Illegal shifted immediate value!");
unsigned STEnc = 0;
switch (ST) {
default: llvm_unreachable("Invalid shift requested");
@@ -169,7 +169,7 @@ inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) {
/// {2-0} = imm3
static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET,
unsigned Imm) {
- assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
+ assert((Imm & 0x7) == Imm && "Illegal shifted immediate value!");
return (getExtendEncoding(ET) << 3) | (Imm & 0x7);
}
@@ -594,7 +594,7 @@ static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
#if defined(_MSC_VER) && _MSC_VER == 1937 && !defined(__clang__) && \
defined(_M_ARM64)
// The MSVC compiler 19.37 for ARM64 has an optimization bug that
- // causes an incorrect behavior with the orignal version. Work around
+ // causes an incorrect behavior with the original version. Work around
// by using a slightly different variation.
// https://developercommunity.visualstudio.com/t/C-ARM64-compiler-optimization-bug/10481261
constexpr uint64_t Mask = 0xFFULL;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 6ee12cc..f214437 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -231,7 +231,7 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
OS << "\n";
}
- void emitAtributesSubsection(
+ void emitAttributesSubsection(
StringRef SubsectionName,
AArch64BuildAttributes::SubsectionOptional Optional,
AArch64BuildAttributes::SubsectionType ParameterType) override {
@@ -278,7 +278,7 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
<< ", " << ParameterStr;
// Keep the data structure consistent with the case of ELF emission
// (important for llvm-mc asm parsing)
- AArch64TargetStreamer::emitAtributesSubsection(SubsectionName, Optional,
+ AArch64TargetStreamer::emitAttributesSubsection(SubsectionName, Optional,
ParameterType);
OS << "\n";
}
@@ -433,10 +433,10 @@ AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
return static_cast<AArch64ELFStreamer &>(Streamer);
}
-void AArch64TargetELFStreamer::emitAtributesSubsection(
+void AArch64TargetELFStreamer::emitAttributesSubsection(
StringRef VendorName, AArch64BuildAttributes::SubsectionOptional IsOptional,
AArch64BuildAttributes::SubsectionType ParameterType) {
- AArch64TargetStreamer::emitAtributesSubsection(VendorName, IsOptional,
+ AArch64TargetStreamer::emitAttributesSubsection(VendorName, IsOptional,
ParameterType);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 5552cea..9d9e23e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -216,7 +216,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) &&
(ImmR == 0 || ImmS < ImmR) && STI.hasFeature(AArch64::HasV8_2aOps)) {
- // BFC takes precedence over its entire range, sligtly differently to BFI.
+ // BFC takes precedence over its entire range, slightly differently to BFI.
int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
int LSB = (BitWidth - ImmR) % BitWidth;
int Width = ImmS + 1;
@@ -2051,7 +2051,7 @@ void AArch64InstPrinter::printImm8OptLsl(const MCInst *MI, unsigned OpNum,
unsigned UnscaledVal = MI->getOperand(OpNum).getImm();
unsigned Shift = MI->getOperand(OpNum + 1).getImm();
assert(AArch64_AM::getShiftType(Shift) == AArch64_AM::LSL &&
- "Unexepected shift type!");
+ "Unexpected shift type!");
// #0 lsl #8 is never pretty printed
if ((UnscaledVal == 0) && (AArch64_AM::getShiftValue(Shift) != 0)) {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index dd3ecb4..b7959e0 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -436,7 +436,7 @@ public:
// architecturally defined to zero extend the upper 32 bits on a write.
if (GPR32RC.contains(Reg))
return true;
- // SIMD&FP instructions operating on scalar data only acccess the lower
+ // SIMD&FP instructions operating on scalar data only access the lower
// bits of a register, the upper bits are zero extended on a write. For
// SIMD vector registers smaller than 128-bits, the upper 64-bits of the
// register are zero extended on a write.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index c5fb7f5..d742b28 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -29,7 +29,7 @@ static cl::opt<bool> MarkBTIProperty(
cl::init(false));
//
-// AArch64TargetStreamer Implemenation
+// AArch64TargetStreamer Implementation
//
AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S)
: MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
@@ -153,14 +153,14 @@ MCTargetStreamer *llvm::createAArch64NullTargetStreamer(MCStreamer &S) {
return new AArch64TargetStreamer(S);
}
-void AArch64TargetStreamer::emitAtributesSubsection(
+void AArch64TargetStreamer::emitAttributesSubsection(
StringRef VendorName, AArch64BuildAttributes::SubsectionOptional IsOptional,
AArch64BuildAttributes::SubsectionType ParameterType) {
// If exists, return.
for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) {
if (VendorName == SubSection.VendorName) {
- activateAtributesSubsection(VendorName);
+ activateAttributesSubsection(VendorName);
return;
}
}
@@ -170,11 +170,11 @@ void AArch64TargetStreamer::emitAtributesSubsection(
AttSubSection.IsOptional = IsOptional;
AttSubSection.ParameterType = ParameterType;
AttributeSubSections.push_back(AttSubSection);
- activateAtributesSubsection(VendorName);
+ activateAttributesSubsection(VendorName);
}
std::unique_ptr<MCELFStreamer::AttributeSubSection>
-AArch64TargetStreamer::getActiveAtributesSubsection() {
+AArch64TargetStreamer::getActiveAttributesSubsection() {
for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) {
if (SubSection.IsActive) {
return std::make_unique<MCELFStreamer::AttributeSubSection>(SubSection);
@@ -184,7 +184,7 @@ AArch64TargetStreamer::getActiveAtributesSubsection() {
}
std::unique_ptr<MCELFStreamer::AttributeSubSection>
-AArch64TargetStreamer::getAtributesSubsectionByName(StringRef Name) {
+AArch64TargetStreamer::getAttributesSubsectionByName(StringRef Name) {
for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) {
if (Name == SubSection.VendorName) {
return std::make_unique<MCELFStreamer::AttributeSubSection>(SubSection);
@@ -238,7 +238,7 @@ void AArch64TargetStreamer::emitAttribute(StringRef VendorName, unsigned Tag,
"not exist");
}
-void AArch64TargetStreamer::activateAtributesSubsection(StringRef VendorName) {
+void AArch64TargetStreamer::activateAttributesSubsection(StringRef VendorName) {
for (MCELFStreamer::AttributeSubSection &SubSection : AttributeSubSections) {
if (VendorName == SubSection.VendorName) {
SubSection.IsActive = true;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index aa26acd..d878f1e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -102,16 +102,16 @@ public:
/// Build attributes implementation
virtual void
- emitAtributesSubsection(StringRef VendorName,
+ emitAttributesSubsection(StringRef VendorName,
AArch64BuildAttributes::SubsectionOptional IsOptional,
AArch64BuildAttributes::SubsectionType ParameterType);
virtual void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value,
std::string String);
- void activateAtributesSubsection(StringRef VendorName);
+ void activateAttributesSubsection(StringRef VendorName);
std::unique_ptr<MCELFStreamer::AttributeSubSection>
- getActiveAtributesSubsection();
+ getActiveAttributesSubsection();
std::unique_ptr<MCELFStreamer::AttributeSubSection>
- getAtributesSubsectionByName(StringRef Name);
+ getAttributesSubsectionByName(StringRef Name);
void
insertAttributeInPlace(const MCELFStreamer::AttributeItem &Attr,
MCELFStreamer::AttributeSubSection &AttSubSection);
@@ -129,7 +129,7 @@ private:
MCSection *AttributeSection = nullptr;
/// Build attributes implementation
- void emitAtributesSubsection(
+ void emitAttributesSubsection(
StringRef VendorName,
AArch64BuildAttributes::SubsectionOptional IsOptional,
AArch64BuildAttributes::SubsectionType ParameterType) override;
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index fc8bef4..bd28716 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -225,7 +225,7 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
}
// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation
-// of multi-vector intrinsics. However, the psuedo should only be emitted if
+// of multi-vector intrinsics. However, the pseudo should only be emitted if
// the input registers of the REG_SEQUENCE are copy nodes where the source
// register is in a StridedOrContiguous class. For example:
//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c567137..56d124d 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9235,11 +9235,12 @@ multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty,
def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
}
-class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
+class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
+ SDPatternOperator op, ValueType vt>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $_Zdn, $Zm",
"",
- []>, Sched<[]> {
+ [(set (vt zprty:$Zdn), (op (vt zprty:$_Zdn), (vt zprty:$Zm)))]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
let Inst{31-17} = 0b010001010010001;
@@ -9253,12 +9254,6 @@ class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
let hasSideEffects = 0;
}
-multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
- SDPatternOperator op, ValueType vt> {
- def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>;
- def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
-}
-
class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn),
asm, "\t$Zdn, $_Zdn",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index f4f391e..9be8821 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -462,6 +462,8 @@ static bool isTriviallyUniform(const Use &U) {
Value *V = U.get();
if (isa<Constant>(V))
return true;
+ if (const auto *A = dyn_cast<Argument>(V))
+ return AMDGPU::isArgPassedInSGPR(A);
if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
return false;
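
A hedged toy model (invented types, not the LLVM/AMDGPU API) of the uniformity ladder this hunk extends: constants were already trivially uniform, and the new case treats function arguments passed in SGPRs as uniform too, since a scalar register holds a single value for the entire wave.

#include <cassert>

struct ToyValue {
  bool IsConstant = false;
  bool IsSGPRArgument = false; // models AMDGPU::isArgPassedInSGPR(A)
};

bool isTriviallyUniformToy(const ToyValue &V) {
  if (V.IsConstant)
    return true; // constants are the same in every lane
  if (V.IsSGPRArgument)
    return true; // new case: SGPR-passed arguments are wave-uniform
  return false;  // anything else needs deeper analysis
}

int main() {
  assert(isTriviallyUniformToy({true, false}));
  assert(isTriviallyUniformToy({false, true}));
  assert(!isTriviallyUniformToy({false, false}));
  return 0;
}
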
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 5daa39ab..e8dff85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4263,7 +4263,7 @@ static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::G_XOR)
return false;
auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
- return ConstVal && *ConstVal == -1;
+ return ConstVal == -1;
}
// Return the use branch instruction, otherwise null if the usage is invalid.
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 12716bd..88b6948 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1537,17 +1537,13 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
let OtherPredicates = [HasFlatAddressSpace] in {
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
@@ -1560,8 +1556,14 @@ let True16Predicate = p in {
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
@@ -1569,8 +1571,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
@@ -1599,9 +1607,7 @@ def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
-def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1680,9 +1686,7 @@ let OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_aext_16_global, i32>;
-defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
-defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
@@ -1702,6 +1706,8 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
@@ -1712,8 +1718,12 @@ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
+defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
+defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
+defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
foreach vt = Reg32Types.types in {
@@ -1747,6 +1757,8 @@ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
}
let OtherPredicates = [HasD16LoadStore] in {
@@ -1772,9 +1784,7 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>
}
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
-defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
-defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4f51feb..8299c0d 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1675,6 +1675,12 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
<< static_cast<int>(Fold.UseOpNo) << " of "
<< *Fold.UseMI);
+
+ if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) {
+ LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI);
+ Changed = true;
+ }
+
} else if (Fold.Commuted) {
// Restore the instruction's original operand order if the fold has failed.
TII->commuteInstruction(*Fold.UseMI, false);
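The added block above runs one extra simplification: once an immediate has been folded into a use, that use may itself have become fully constant and can be evaluated away. A minimal standalone sketch of the idea (toy types and a pretend add opcode, not the LLVM MachineInstr API):

#include <cstdint>
#include <optional>

// Toy two-operand instruction: each operand is either a register (unknown)
// or an immediate (engaged optional). Illustrative only.
struct Inst {
  std::optional<int32_t> Lhs, Rhs;
};

// After folding an immediate into one operand, the whole instruction may
// have become constant; evaluate it if so, mirroring what a
// tryConstantFoldOp-style hook does after a successful immediate fold.
std::optional<int32_t> tryConstantFold(const Inst &I) {
  if (I.Lhs && I.Rhs)
    return *I.Lhs + *I.Rhs; // pretend the opcode is an integer add
  return std::nullopt;
}

int main() {
  Inst I{std::nullopt, 7}; // add %dst, %reg, 7
  I.Lhs = 5;               // fold the immediate 5 into the register operand
  return tryConstantFold(I).value_or(-1) == 12 ? 0 : 1;
}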
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 083345d..2c20475 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1973,7 +1973,8 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
bit HasClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod,
- bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0> {
+ bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0,
+ bit HasBitOp3 = 0> {
dag src0 = !if(!ge(NumSrcArgs, 1),
!if (HasModifiers,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0),
@@ -1999,21 +2000,23 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!con(!if(HasFP8DstByteSel, (ins VGPR_32:$vdst_in), (ins)),
(ins ByteSel0:$byte_sel)),
(ins));
+ dag bitop3 = !if(HasBitOp3, (ins bitop3_0:$bitop3), (ins));
- dag ret = !con(src0, src1, src2, clamp, omod, bytesel);
+ dag ret = !con(src0, src1, src2, clamp, omod, bytesel, bitop3);
}
class getInsVOP3Base<RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
bit HasClamp, bit HasModifiers, bit HasSrc2Mods, bit HasOMod,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOpSel,
- bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0> {
+ bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0, bit HasBitOp3 = 0> {
// getIns64 handles clamp and omod. Implicit mutex between vop3p and omod.
dag base = getIns64 <Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
Src0Mod, Src1Mod, Src2Mod, HasFP8ByteSel, HasFP8DstByteSel>.ret;
dag opsel = (ins op_sel0:$op_sel);
- dag ret = !con(base, !if(HasOpSel, opsel, (ins)));
+ dag bitop3 = (ins bitop3_0:$bitop3);
+ dag ret = !con(base, !if(HasBitOp3, bitop3, (ins)), !if(HasOpSel, opsel, (ins)));
}
class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
@@ -2034,11 +2037,13 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
bit HasClamp, bit HasOMod,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod,
+ bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0, bit HasBitOp3 = 0> {
dag ret = getInsVOP3Base<Src0RC, Src1RC,
Src2RC, NumSrcArgs,
HasClamp, 1/*HasModifiers*/, 1/*HasSrc2Mods*/, HasOMod,
- Src0Mod, Src1Mod, Src2Mod, /*HasOpSel=*/1>.ret;
+ Src0Mod, Src1Mod, Src2Mod, /*HasOpSel=*/1,
+ HasFP8ByteSel, HasFP8DstByteSel, HasBitOp3>.ret;
}
class getInsDPPBase <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
@@ -2244,7 +2249,9 @@ class getAsmVOP3OpSel <int NumSrcArgs,
bit HasOMod,
bit Src0HasMods,
bit Src1HasMods,
- bit Src2HasMods> {
+ bit Src2HasMods,
+ bit HasByteSel = 0,
+ bit HasBitOp3 = 0> {
string dst = "$vdst";
string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
@@ -2263,9 +2270,11 @@ class getAsmVOP3OpSel <int NumSrcArgs,
string src1 = !if(Src1HasMods, fsrc1, isrc1);
string src2 = !if(Src2HasMods, fsrc2, isrc2);
+ string bytesel = !if(HasByteSel, "$byte_sel", "");
string clamp = !if(HasClamp, "$clamp", "");
string omod = !if(HasOMod, "$omod", "");
- string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp#omod;
+ string bitop3 = !if(HasBitOp3, "$bitop3", "");
+ string ret = dst#", "#src0#src1#src2#bitop3#"$op_sel"#bytesel#clamp#omod;
}
class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
@@ -2297,7 +2306,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
bit HasOpSel, bit HasOMod, bit IsVOP3P,
bit HasNeg, bit Src0HasMods,
bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
- bit HasByteSel = 0> {
+ bit HasByteSel = 0, bit HasBitOp3 = 0> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
"$sdst",
@@ -2320,6 +2329,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
string bytesel = !if(HasByteSel, "$byte_sel", "");
+ string bitop3 = !if(HasBitOp3, "$bitop3", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
#!if(HasNeg, "$neg_lo$neg_hi", ""),
@@ -2329,7 +2339,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string ret = dst#!if(!eq(NumSrcArgs,0),
"",
- !if(HasDst,", ", "")#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod);
+ !if(HasDst,", ", "")#src0#src1#src2#bitop3#opsel#bytesel#3PMods#clamp#omod);
}
class getAsmVOP3DPP<string base> {
@@ -2554,6 +2564,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasFP8DstByteSel = 0;
field bit HasFP4DstByteSel = 0;
field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel);
+ field bit HasBitOp3 = 0;
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
@@ -2624,13 +2635,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasClamp, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod,
- HasFP8ByteSel, HasFP8DstByteSel>.ret;
+ HasFP8ByteSel, HasFP8DstByteSel, HasBitOp3>.ret;
field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
NumSrcArgs, HasClamp, HasOpSel, HasNeg,
Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
NumSrcArgs, HasClamp, HasOMod,
- Src0Mod, Src1Mod, Src2Mod>.ret;
+ Src0Mod, Src1Mod, Src2Mod,
+ HasFP8ByteSel, HasFP8DstByteSel, HasBitOp3>.ret;
field dag InsDPP = !if(HasExtDPP,
getInsDPP<DstRCDPP, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret,
@@ -2643,7 +2655,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
defvar InsVOP3DPPBase = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel,
- HasFP8ByteSel, HasFP8DstByteSel>.ret;
+ HasFP8ByteSel, HasFP8DstByteSel, HasBitOp3>.ret;
defvar InsVOP3PDPPBase = getInsVOP3P<Src0VOP3DPP, Src1VOP3DPP,
Src2VOP3DPP, NumSrcArgs, HasClamp, HasOpSel, HasNeg,
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP>.ret;
@@ -2671,8 +2683,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
// the asm operand name via this HasModifiers flag
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
- HasOpSel, HasOMod, IsVOP3P, HasNeg, HasModifiers, HasModifiers,
- HasModifiers, DstVT, HasFP8ByteSel>.ret;
+ HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods,
+ HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
field string Asm64 = AsmVOP3Base;
field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
@@ -2680,7 +2692,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
HasOMod,
HasSrc0FloatMods,
HasSrc1FloatMods,
- HasSrc2FloatMods>.ret;
+ HasSrc2FloatMods,
+ HasFP8ByteSel,
+ HasBitOp3>.ret;
field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3Base>.ret;
field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a2672d7..ef88a37 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1003,18 +1003,7 @@ class VOP3_BITOP3_Profile<VOPProfile pfl, VOP3Features f> : VOP3_Profile<pfl, f>
let HasClamp = 0;
let HasOMod = 0;
let HasModifiers = 0;
-
- let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- 0 /* HasIntClamp */, HasModifiers, HasSrc2Mods,
- HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
- (ins bitop3_0:$bitop3));
-
- let InsVOP3OpSel = !con(getInsVOP3Base<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, 0, 1, 1, 0,
- Src0Mod, Src1Mod, Src2Mod, 0>.ret,
- (ins bitop3_0:$bitop3, op_sel0:$op_sel));
-
- let Asm64 = "$vdst, $src0, $src1, $src2$bitop3";
- let AsmVOP3OpSel = !subst("$op_sel", "$bitop3$op_sel", getAsmVOP3OpSel<3, 0, 0, 0, 0, 0>.ret);
+ let HasBitOp3 = 1;
}
class VOP3_CVT_SCALE_F1632_FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
@@ -1055,6 +1044,7 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let HasFP8DstByteSel = 1;
+ let HasFP8ByteSel = 0; // It works as a dst-bytesel, but does not have a byte_sel operand.
}
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
@@ -1063,6 +1053,7 @@ class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
VGPR_32:$vdst_in, op_sel0:$op_sel);
let HasFP8DstByteSel = 1;
+ let HasFP8ByteSel = 0; // It works as a dst-bytesel, but does not have a byte_sel operand.
}
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 952ee2f..4cd845a 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1478,6 +1478,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+ let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
+ let HasFP8DstByteSel = P.HasFP8DstByteSel;
+ let HasOMod = P.HasOMod;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1494,6 +1497,9 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+ let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
+ let HasFP8DstByteSel = P.HasFP8DstByteSel;
+ let HasOMod = P.HasOMod;
let HasModifiers =
!if (Features.IsMAI, 0,
@@ -1506,6 +1512,9 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> :
let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
+ let HasFP8SrcByteSel = P.HasFP8SrcByteSel;
+ let HasFP8DstByteSel = P.HasFP8DstByteSel;
+ let HasOMod = P.HasOMod;
let HasModifiers =
!if (Features.IsMAI, 0,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 574281d..8455eef 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -613,7 +613,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
- { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Single-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 4
@@ -630,7 +630,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
{ RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
- { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+ { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Floating-point to integer conversions.
// RTABI chapter 4.1.2, Table 6
@@ -1275,50 +1275,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UREM, MVT::i64, Custom);
HasStandaloneRem = false;
- if (Subtarget->isTargetWindows()) {
- const struct {
- const RTLIB::Libcall Op;
- const char * const Name;
- const CallingConv::ID CC;
- } LibraryCalls[] = {
- { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
- { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
- { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
- { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
-
- { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
- { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
- { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
- { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
- };
-
- for (const auto &LC : LibraryCalls) {
- setLibcallName(LC.Op, LC.Name);
- setLibcallCallingConv(LC.Op, LC.CC);
- }
- } else {
- const struct {
- const RTLIB::Libcall Op;
- const char * const Name;
- const CallingConv::ID CC;
- } LibraryCalls[] = {
- { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
- { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
- { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
- { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
-
- { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
- { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
- { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
- { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
- };
-
- for (const auto &LC : LibraryCalls) {
- setLibcallName(LC.Op, LC.Name);
- setLibcallCallingConv(LC.Op, LC.CC);
- }
- }
-
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
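For the unordered comparisons, the RTABI helper's return value is already the final predicate (nonzero iff the operands are unordered), so the table entry now carries ISD::SETCC_INVALID, presumably signalling that no extra compare against zero (the old SETNE) is needed. A reference model of the helper's documented semantics, not the RTABI implementation:

#include <cmath>

// Documented behaviour of __aeabi_fcmpun: nonzero iff at least one
// operand is a NaN, i.e. the pair is unordered.
int fcmpun(float a, float b) {
  return (std::isnan(a) || std::isnan(b)) ? 1 : 0;
}

int main() {
  float nan = std::nanf("");
  // The helper already yields the final 0/1 answer, so the caller can
  // branch on it directly.
  return (fcmpun(1.0f, nan) == 1 && fcmpun(1.0f, 2.0f) == 0) ? 0 : 1;
}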
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 7329d3f..890a22f 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -348,31 +348,11 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
- // ARM EABI is the bare-metal EABI described in ARM ABI documents and
- // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
- // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
- // even for GNUEABI, so we can make a distinction here and still conform to
- // the EABI on GNU (and Android) mode. This requires change in Clang, too.
- // FIXME: The Darwin exception is temporary, while we move users to
- // "*-*-*-macho" triples as quickly as possible.
- bool isTargetAEABI() const {
- return (TargetTriple.getEnvironment() == Triple::EABI ||
- TargetTriple.getEnvironment() == Triple::EABIHF) &&
- !isTargetDarwin() && !isTargetWindows();
- }
- bool isTargetGNUAEABI() const {
- return (TargetTriple.getEnvironment() == Triple::GNUEABI ||
- TargetTriple.getEnvironment() == Triple::GNUEABIT64 ||
- TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
- TargetTriple.getEnvironment() == Triple::GNUEABIHFT64) &&
- !isTargetDarwin() && !isTargetWindows();
- }
- bool isTargetMuslAEABI() const {
- return (TargetTriple.getEnvironment() == Triple::MuslEABI ||
- TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
- TargetTriple.getEnvironment() == Triple::OpenHOS) &&
- !isTargetDarwin() && !isTargetWindows();
- }
+ bool isTargetAEABI() const { return TargetTriple.isTargetAEABI(); }
+
+ bool isTargetGNUAEABI() const { return TargetTriple.isTargetGNUAEABI(); }
+
+ bool isTargetMuslAEABI() const { return TargetTriple.isTargetMuslAEABI(); }
// ARM Targets that support EHABI exception handling standard
// Darwin uses SjLj. Other targets might need more checks.
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index d29ef24..8c7bc2f 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -414,9 +414,7 @@ static void replaceWithGEP(CallInst *Call, uint32_t DimensionIndex,
Constant *Zero =
ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);
- SmallVector<Value *, 4> IdxList;
- for (unsigned I = 0; I < Dimension; ++I)
- IdxList.push_back(Zero);
+ SmallVector<Value *, 4> IdxList(Dimension, Zero);
IdxList.push_back(Call->getArgOperand(GEPIndex));
auto *GEP = GetElementPtrInst::CreateInBounds(getBaseElementType(Call),
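The loop that pushed Zero into the vector Dimension times is replaced by SmallVector's count-plus-value constructor. The same idiom with a standard container, for illustration:

#include <cassert>
#include <vector>

int main() {
  unsigned Dimension = 3;
  int Zero = 0;
  // Equivalent of SmallVector<Value *, 4> IdxList(Dimension, Zero):
  // build the vector with Dimension copies of Zero in one step, then append.
  std::vector<int> IdxList(Dimension, Zero);
  IdxList.push_back(42); // stands in for the GEP index operand
  assert(IdxList.size() == 4 && IdxList[0] == 0);
}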
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 20260c8..b968e05 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -182,6 +182,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
if (Subtarget.hasBasicF()) {
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
@@ -203,6 +205,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
Subtarget.isSoftFPABI() ? LibCall : Custom);
setOperationAction(ISD::FP_TO_FP16, MVT::f32,
Subtarget.isSoftFPABI() ? LibCall : Custom);
+ setOperationAction(ISD::BF16_TO_FP, MVT::f32, Custom);
+ setOperationAction(ISD::FP_TO_BF16, MVT::f32,
+ Subtarget.isSoftFPABI() ? LibCall : Custom);
if (Subtarget.is64Bit())
setOperationAction(ISD::FRINT, MVT::f32, Legal);
@@ -221,6 +226,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
if (Subtarget.hasBasicD()) {
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
@@ -243,6 +250,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64,
Subtarget.isSoftFPABI() ? LibCall : Custom);
+ setOperationAction(ISD::BF16_TO_FP, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_BF16, MVT::f64,
+ Subtarget.isSoftFPABI() ? LibCall : Custom);
if (Subtarget.is64Bit())
setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -499,6 +509,10 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerFP_TO_FP16(Op, DAG);
case ISD::FP16_TO_FP:
return lowerFP16_TO_FP(Op, DAG);
+ case ISD::FP_TO_BF16:
+ return lowerFP_TO_BF16(Op, DAG);
+ case ISD::BF16_TO_FP:
+ return lowerBF16_TO_FP(Op, DAG);
}
return SDValue();
}
@@ -2136,6 +2150,51 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
}
}
+/// Lower VECTOR_SHUFFLE as lane permute and then shuffle (if possible).
+/// Only for 256-bit vector.
+///
+/// For example:
+/// %2 = shufflevector <4 x i64> %0, <4 x i64> poison,
+/// <4 x i32> <i32 0, i32 3, i32 2, i32 0>
+/// is lowered to:
+/// (XVPERMI $xr2, $xr0, 78)
+/// (XVSHUF $xr1, $xr2, $xr0)
+/// (XVORI $xr0, $xr1, 0)
+static SDValue lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(const SDLoc &DL,
+ ArrayRef<int> Mask,
+ MVT VT, SDValue V1,
+ SDValue V2,
+ SelectionDAG &DAG) {
+ assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
+ int Size = Mask.size();
+ int LaneSize = Size / 2;
+
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+
+ // Bail out if no element crosses between the two lanes.
+ if (!LaneCrossing[0] && !LaneCrossing[1])
+ return SDValue();
+
+ SmallVector<int> InLaneMask;
+ InLaneMask.assign(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+
+ SDValue Flipped = DAG.getBitcast(MVT::v4i64, V1);
+ Flipped = DAG.getVectorShuffle(MVT::v4i64, DL, Flipped,
+ DAG.getUNDEF(MVT::v4i64), {2, 3, 0, 1});
+ Flipped = DAG.getBitcast(VT, Flipped);
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
+}
+
/// Dispatching routine to lower various 256-bit LoongArch vector shuffles.
///
/// This routine breaks down the specific type of 256-bit shuffle and
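A self-contained model of the InLaneMask rewrite in lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle above, run on the mask from its comment (assumes the two-lane split of a 256-bit vector; plain ints stand in for shuffle indices):

#include <cstdio>
#include <vector>

// Rewrites a cross-lane mask so every element is taken either from the
// original vector (indices [0, Size)) or from its lane-swapped copy
// (indices [Size, 2*Size)), mirroring the InLaneMask loop above.
std::vector<int> makeInLaneMask(const std::vector<int> &Mask) {
  int Size = Mask.size();
  int LaneSize = Size / 2;
  std::vector<int> InLane(Mask);
  for (int i = 0; i < Size; ++i) {
    int M = InLane[i];
    if (M < 0)
      continue;
    if ((M % Size) / LaneSize != i / LaneSize)
      InLane[i] = (M % LaneSize) + (i / LaneSize) * LaneSize + Size;
  }
  return InLane;
}

int main() {
  // Mask <0,3,2,0> on a v4i64: element 1 is redirected to index 5
  // (element 1 of the lane-swapped copy = original element 3) and
  // element 3 to index 6 (= original element 0).
  for (int m : makeInLaneMask({0, 3, 2, 0}))
    std::printf("%d ", m); // prints: 0 5 2 6
}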
@@ -2168,6 +2227,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
return Result;
+ if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
+ V1, V2, DAG)))
+ return Result;
// TODO: This comment may be enabled in the future to better match the
// pattern for instruction selection.
@@ -2285,6 +2347,36 @@ SDValue LoongArchTargetLowering::lowerFP16_TO_FP(SDValue Op,
return Res;
}
+SDValue LoongArchTargetLowering::lowerFP_TO_BF16(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasBasicF() && "Unexpected custom legalization");
+ SDLoc DL(Op);
+ MakeLibCallOptions CallOptions;
+ RTLIB::Libcall LC =
+ RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
+ SDValue Res =
+ makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
+ if (Subtarget.is64Bit())
+ return DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Res);
+ return DAG.getBitcast(MVT::i32, Res);
+}
+
+SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget.hasBasicF() && "Unexpected custom legalization");
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ Op = DAG.getNode(
+ ISD::SHL, DL, Op.getOperand(0).getValueType(), Op.getOperand(0),
+ DAG.getShiftAmountConstant(16, Op.getOperand(0).getValueType(), DL));
+ SDValue Res = Subtarget.is64Bit() ? DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64,
+ DL, MVT::f32, Op)
+ : DAG.getBitcast(MVT::f32, Op);
+ if (VT != MVT::f32)
+ return DAG.getNode(ISD::FP_EXTEND, DL, VT, Res);
+ return Res;
+}
+
static bool isConstantOrUndef(const SDValue Op) {
if (Op->isUndef())
return true;
@@ -7945,8 +8037,9 @@ bool LoongArchTargetLowering::splitValueIntoRegisterParts(
bool IsABIRegCopy = CC.has_value();
EVT ValueVT = Val.getValueType();
- if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
- // Cast the f16 to i16, extend to i32, pad with ones to make a float
+ if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+ PartVT == MVT::f32) {
+ // Cast the [b]f16 to i16, extend to i32, pad with ones to make a float
// nan, and cast to f32.
Val = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Val);
Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Val);
@@ -7965,10 +8058,11 @@ SDValue LoongArchTargetLowering::joinRegisterPartsIntoValue(
MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
bool IsABIRegCopy = CC.has_value();
- if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) {
+ if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+ PartVT == MVT::f32) {
SDValue Val = Parts[0];
- // Cast the f32 to i32, truncate to i16, and cast back to f16.
+ // Cast the f32 to i32, truncate to i16, and cast back to [b]f16.
Val = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Val);
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Val);
Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
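The widening path in lowerBF16_TO_FP above relies on the bf16 layout: a bf16 value occupies the top 16 bits of an IEEE-754 binary32, so BF16_TO_FP is a 16-bit shift plus a bitcast, while the narrowing direction still goes through a libcall (lowerFP_TO_BF16) because it needs correct rounding. A standalone sketch of the widening half:

#include <cstdint>
#include <cstring>

// bf16 is the high half of a binary32, so widening is a left shift by 16
// followed by a bitcast -- the SHL + MOVGR2FR/bitcast sequence above.
float bf16_to_float(uint16_t b) {
  uint32_t bits = uint32_t(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int main() {
  // 0x3F80 is 1.0f truncated to bf16 (sign 0, exponent 127, mantissa 0).
  return bf16_to_float(0x3F80) == 1.0f ? 0 : 1;
}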
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 4b6d3272..53e3f1a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -373,6 +373,8 @@ private:
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 6b058d1..fd850fa 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -482,7 +482,8 @@ def ProcessorFeatures {
// Future
// For future CPU we assume that all of the existing features from Power11
// still exist with the exception of those we know are Power11 specific.
- list<SubtargetFeature> FutureAdditionalFeatures = [FeatureISAFuture];
+ list<SubtargetFeature> FutureAdditionalFeatures = [DirectivePwrFuture,
+ FeatureISAFuture];
list<SubtargetFeature> FutureSpecificFeatures = [];
list<SubtargetFeature> FutureInheritableFeatures =
!listconcat(P11InheritableFeatures, FutureAdditionalFeatures);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 0c2a506..59c8998 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1428,39 +1428,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
}
- setLibcallName(RTLIB::LOG_F128, "logf128");
- setLibcallName(RTLIB::LOG2_F128, "log2f128");
- setLibcallName(RTLIB::LOG10_F128, "log10f128");
- setLibcallName(RTLIB::EXP_F128, "expf128");
- setLibcallName(RTLIB::EXP2_F128, "exp2f128");
- setLibcallName(RTLIB::SIN_F128, "sinf128");
- setLibcallName(RTLIB::COS_F128, "cosf128");
- setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
- setLibcallName(RTLIB::POW_F128, "powf128");
- setLibcallName(RTLIB::FMIN_F128, "fminf128");
- setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
- setLibcallName(RTLIB::REM_F128, "fmodf128");
- setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
- setLibcallName(RTLIB::CEIL_F128, "ceilf128");
- setLibcallName(RTLIB::FLOOR_F128, "floorf128");
- setLibcallName(RTLIB::TRUNC_F128, "truncf128");
- setLibcallName(RTLIB::ROUND_F128, "roundf128");
- setLibcallName(RTLIB::LROUND_F128, "lroundf128");
- setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
- setLibcallName(RTLIB::RINT_F128, "rintf128");
- setLibcallName(RTLIB::LRINT_F128, "lrintf128");
- setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
- setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
- setLibcallName(RTLIB::FMA_F128, "fmaf128");
- setLibcallName(RTLIB::FREXP_F128, "frexpf128");
-
- if (Subtarget.isAIXABI()) {
- setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
- setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");
- setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");
- setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");
- }
-
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
@@ -1476,7 +1443,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setMinFunctionAlignment(Align(4));
- switch (Subtarget.getCPUDirective()) {
+ auto CPUDirective = Subtarget.getCPUDirective();
+ switch (CPUDirective) {
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
@@ -1508,15 +1476,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// The Freescale cores do better with aggressive inlining of memcpy and
// friends. GCC uses same threshold of 128 bytes (= 32 word stores).
- if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
- Subtarget.getCPUDirective() == PPC::DIR_E5500) {
+ if (CPUDirective == PPC::DIR_E500mc || CPUDirective == PPC::DIR_E5500) {
MaxStoresPerMemset = 32;
MaxStoresPerMemsetOptSize = 16;
MaxStoresPerMemcpy = 32;
MaxStoresPerMemcpyOptSize = 8;
MaxStoresPerMemmove = 32;
MaxStoresPerMemmoveOptSize = 8;
- } else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
+ } else if (CPUDirective == PPC::DIR_A2) {
// The A2 also benefits from (very) aggressive inlining of memcpy and
// friends. The overhead of the function call, even when warm, can be
// over one hundred cycles.
@@ -1529,6 +1496,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxLoadsPerMemcmpOptSize = 4;
}
+ // Enable generation of STXVP instructions by default for mcpu=future.
+ if (CPUDirective == PPC::DIR_PWR_FUTURE &&
+ DisableAutoPairedVecSt.getNumOccurrences() == 0)
+ DisableAutoPairedVecSt = false;
+
IsStrictFPEnabled = true;
// Let the subtarget (CPU) decide if a predictable select is more expensive
@@ -9667,7 +9639,24 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
}
- if (!BVNIsConstantSplat || SplatBitSize > 32) {
+ bool IsSplat64 = false;
+ uint64_t SplatBits = 0;
+ int32_t SextVal = 0;
+ if (BVNIsConstantSplat && SplatBitSize <= 64) {
+ SplatBits = APSplatBits.getZExtValue();
+ if (SplatBitSize <= 32) {
+ SextVal = SignExtend32(SplatBits, SplatBitSize);
+ } else if (SplatBitSize == 64 && Subtarget.hasP8Altivec()) {
+ int64_t Splat64Val = static_cast<int64_t>(SplatBits);
+ bool P9Vector = Subtarget.hasP9Vector();
+ int32_t Hi = P9Vector ? 127 : 15;
+ int32_t Lo = P9Vector ? -128 : -16;
+ IsSplat64 = Splat64Val >= Lo && Splat64Val <= Hi;
+ SextVal = static_cast<int32_t>(SplatBits);
+ }
+ }
+
+ if (!BVNIsConstantSplat || (SplatBitSize > 32 && !IsSplat64)) {
unsigned NewOpcode = PPCISD::LD_SPLAT;
// Handle load-and-splat patterns as we have instructions that will do this
@@ -9753,7 +9742,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return SDValue();
}
- uint64_t SplatBits = APSplatBits.getZExtValue();
uint64_t SplatUndef = APSplatUndef.getZExtValue();
unsigned SplatSize = SplatBitSize / 8;
@@ -9788,13 +9776,43 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
dl);
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
- int32_t SextVal = SignExtend32(SplatBits, SplatBitSize);
- if (SextVal >= -16 && SextVal <= 15)
- return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
- dl);
+ // Use VSPLTIW/VUPKLSW for v2i64 in range [-16,15].
+ if (SextVal >= -16 && SextVal <= 15) {
+ // SplatSize may be 1, 2, 4, or 8. Use size 4 instead of 8 for the splat to
+ // generate a splat word with extend for size 8.
+ unsigned UseSize = SplatSize == 8 ? 4 : SplatSize;
+ SDValue Res =
+ getCanonicalConstSplat(SextVal, UseSize, Op.getValueType(), DAG, dl);
+ if (SplatSize != 8)
+ return Res;
+ return BuildIntrinsicOp(Intrinsic::ppc_altivec_vupklsw, Res, DAG, dl);
+ }
// Two instruction sequences.
+ if (Subtarget.hasP9Vector() && SextVal >= -128 && SextVal <= 127) {
+ SDValue C = DAG.getConstant((unsigned char)SextVal, dl, MVT::i32);
+ SmallVector<SDValue, 16> Ops(16, C);
+ SDValue BV = DAG.getBuildVector(MVT::v16i8, dl, Ops);
+ unsigned IID;
+ switch (SplatSize) {
+ default:
+ llvm_unreachable("Unexpected type for vector constant.");
+ case 2:
+ IID = Intrinsic::ppc_altivec_vupklsb;
+ break;
+ case 4:
+ IID = Intrinsic::ppc_altivec_vextsb2w;
+ break;
+ case 8:
+ IID = Intrinsic::ppc_altivec_vextsb2d;
+ break;
+ }
+ SDValue Extend = BuildIntrinsicOp(IID, BV, DAG, dl);
+ return DAG.getBitcast(Op->getValueType(0), Extend);
+ }
+ assert(!IsSplat64 && "Unhandled 64-bit splat pattern");
+
// If this value is in the range [-32,30] and is even, use:
// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
// If this value is in the range [17,31] and is odd, use:
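The splat handling now computes SextVal up front and extends the fast paths to 64-bit splats: a v2i64 constant in [-16,15] (or [-128,127] with P9Vector) can be built by splatting a narrower element and sign-extending in-register, avoiding a constant-pool load. A small model of the SignExtend32 step (hypothetical helper mirroring llvm::SignExtend32):

#include <cstdint>

// Treat the low Bits bits of V as a signed value, like llvm::SignExtend32.
int32_t signExtend32(uint32_t V, unsigned Bits) {
  unsigned Shift = 32 - Bits;
  return int32_t(V << Shift) >> Shift;
}

int main() {
  // A splat of the 5-bit pattern 0b11110 is the value -2: inside [-16,15],
  // so a single VSPLTI[bhw] (plus VUPKLSW for v2i64) covers it.
  return signExtend32(0x1E, 5) == -2 ? 0 : 1;
}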
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 1c53b74..1835065 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -42,6 +42,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
@@ -997,9 +998,9 @@ bool PPCMIPeephole::simplifyCode() {
// the transformation.
bool IsWordAligned = false;
if (SrcMI->getOperand(1).isGlobal()) {
- const GlobalObject *GO =
- dyn_cast<GlobalObject>(SrcMI->getOperand(1).getGlobal());
- if (GO && GO->getAlign() && *GO->getAlign() >= 4 &&
+ const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(SrcMI->getOperand(1).getGlobal());
+ if (GV && GV->getAlign() && *GV->getAlign() >= 4 &&
(SrcMI->getOperand(1).getOffset() % 4 == 0))
IsWordAligned = true;
} else if (SrcMI->getOperand(1).isImm()) {
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 690068d..83eefc0 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -75,7 +75,8 @@ def FeatureStdExtI
RISCVExtensionBitmask<0, 8>;
def FeatureStdExtE
- : RISCVExtension<2, 0, "Embedded Instruction Set with 16 GPRs">;
+ : RISCVExtension<2, 0, "Embedded Instruction Set with 16 GPRs">,
+ RISCVExtensionBitmask<0, 4>;
def FeatureStdExtZic64b
: RISCVExtension<1, 0, "Cache Block Size Is 64 Bytes">;
@@ -510,7 +511,8 @@ def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
def FeatureStdExtB
: RISCVExtension<1, 0, "the collection of the Zba, Zbb, Zbs extensions",
- [FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs]>;
+ [FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs]>,
+ RISCVExtensionBitmask<0, 1>;
def FeatureStdExtZbkb
: RISCVExtension<1, 0, "Bitmanip instructions for Cryptography">,
@@ -887,8 +889,8 @@ def HasVInstructionsFullMultiply : Predicate<"Subtarget->hasVInstructionsFullMul
// Hypervisor Extensions
-def FeatureStdExtH : RISCVExtension<1, 0, "Hypervisor">;
-
+def FeatureStdExtH : RISCVExtension<1, 0, "Hypervisor">,
+ RISCVExtensionBitmask<0, 7>;
def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">,
AssemblerPredicate<(all_of FeatureStdExtH),
"'H' (Hypervisor)">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 494d6ed..babc0d7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -613,8 +613,8 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) {
if (!N0.hasOneUse())
return false;
- auto BitfieldExtract = [&](SDValue N0, unsigned Msb, unsigned Lsb, SDLoc DL,
- MVT VT) {
+ auto BitfieldExtract = [&](SDValue N0, unsigned Msb, unsigned Lsb,
+ const SDLoc &DL, MVT VT) {
unsigned Opc =
Subtarget->hasVendorXTHeadBb() ? RISCV::TH_EXT : RISCV::NDS_BFOS;
return CurDAG->getMachineNode(Opc, DL, VT, N0.getOperand(0),
@@ -671,15 +671,26 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) {
return false;
}
-bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, SDLoc DL,
- MVT VT, SDValue X,
- unsigned Msb, unsigned Lsb) {
- // Only supported with XTHeadBb/XAndesPerf at the moment.
- if (!Subtarget->hasVendorXTHeadBb() && !Subtarget->hasVendorXAndesPerf())
+bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
+ const SDLoc &DL, MVT VT,
+ SDValue X, unsigned Msb,
+ unsigned Lsb) {
+ unsigned Opc;
+
+ if (Subtarget->hasVendorXTHeadBb()) {
+ Opc = RISCV::TH_EXTU;
+ } else if (Subtarget->hasVendorXAndesPerf()) {
+ Opc = RISCV::NDS_BFOZ;
+ } else if (Subtarget->hasVendorXqcibm()) {
+ Opc = RISCV::QC_EXTU;
+ // QC.EXTU X, width, shamt
+ // shamt is the same as Lsb.
+ // width is the number of bits to extract, starting at the Lsb.
+ Msb = Msb - Lsb + 1;
+ } else {
+ // Only supported with XTHeadBb/XAndesPerf/Xqcibm at the moment.
return false;
-
- unsigned Opc =
- Subtarget->hasVendorXTHeadBb() ? RISCV::TH_EXTU : RISCV::NDS_BFOZ;
+ }
SDNode *Ube = CurDAG->getMachineNode(Opc, DL, VT, X,
CurDAG->getTargetConstant(Msb, DL, VT),
@@ -688,9 +699,9 @@ bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, SDLoc DL,
return true;
}
-bool RISCVDAGToDAGISel::tryUnsignedBitfieldInsertInZero(SDNode *Node, SDLoc DL,
- MVT VT, SDValue X,
- unsigned Msb,
+bool RISCVDAGToDAGISel::tryUnsignedBitfieldInsertInZero(SDNode *Node,
+ const SDLoc &DL, MVT VT,
+ SDValue X, unsigned Msb,
unsigned Lsb) {
// Only supported with XAndesPerf at the moment.
if (!Subtarget->hasVendorXAndesPerf())
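QC.EXTU encodes the bitfield as (width, shamt) rather than (Msb, Lsb), hence the Msb = Msb - Lsb + 1 adjustment before emission above. A reference model of the extraction, assuming the width/shamt reading given in the comment:

#include <cstdint>

// Extract `width` bits of x starting at bit `shamt`, zero-extended,
// i.e. bits [shamt + width - 1 : shamt].
uint32_t extu(uint32_t x, unsigned width, unsigned shamt) {
  uint32_t mask = width >= 32 ? ~0u : (1u << width) - 1;
  return (x >> shamt) & mask;
}

int main() {
  // Bits [11:4] of 0xABCD: Msb=11, Lsb=4 -> width = 11 - 4 + 1 = 8, shamt = 4.
  return extu(0xABCD, 8, 4) == 0xBC ? 0 : 1;
}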
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index f199c20..ccbba88 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -77,9 +77,9 @@ public:
bool tryShrinkShlLogicImm(SDNode *Node);
bool trySignedBitfieldExtract(SDNode *Node);
- bool tryUnsignedBitfieldExtract(SDNode *Node, SDLoc DL, MVT VT, SDValue X,
- unsigned Msb, unsigned Lsb);
- bool tryUnsignedBitfieldInsertInZero(SDNode *Node, SDLoc DL, MVT VT,
+ bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
+ SDValue X, unsigned Msb, unsigned Lsb);
+ bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT,
SDValue X, unsigned Msb, unsigned Lsb);
bool tryIndexedLoad(SDNode *Node);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1cbd3f4..cefcd91 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1574,7 +1574,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// zve32x is broken for partial_reduce_umla, but let's not make it worse.
if (Subtarget.hasStdExtZvqdotq() && Subtarget.getELen() >= 64) {
static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
- ISD::PARTIAL_REDUCE_UMLA};
+ ISD::PARTIAL_REDUCE_UMLA,
+ ISD::PARTIAL_REDUCE_SUMLA};
setPartialReduceMLAAction(MLAOps, MVT::nxv1i32, MVT::nxv4i8, Custom);
setPartialReduceMLAAction(MLAOps, MVT::nxv2i32, MVT::nxv8i8, Custom);
setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Custom);
@@ -8318,6 +8319,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
+ case ISD::PARTIAL_REDUCE_SUMLA:
return lowerPARTIAL_REDUCE_MLA(Op, DAG);
}
}
@@ -8534,8 +8536,20 @@ SDValue RISCVTargetLowering::lowerPARTIAL_REDUCE_MLA(SDValue Op,
B = convertToScalableVector(ContainerVT, B, DAG, Subtarget);
}
- bool IsSigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_SMLA;
- unsigned Opc = IsSigned ? RISCVISD::VQDOT_VL : RISCVISD::VQDOTU_VL;
+ unsigned Opc;
+ switch (Op.getOpcode()) {
+ case ISD::PARTIAL_REDUCE_SMLA:
+ Opc = RISCVISD::VQDOT_VL;
+ break;
+ case ISD::PARTIAL_REDUCE_UMLA:
+ Opc = RISCVISD::VQDOTU_VL;
+ break;
+ case ISD::PARTIAL_REDUCE_SUMLA:
+ Opc = RISCVISD::VQDOTSU_VL;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
SDValue Res = DAG.getNode(Opc, DL, ContainerVT, {A, B, Accum, Mask, VL});
if (VT.isFixedLengthVector())
@@ -18358,31 +18372,6 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
DAG.getBuildVector(VT, DL, RHSOps));
}
-static SDValue lowerVQDOT(unsigned Opc, SDValue Op0, SDValue Op1,
- const SDLoc &DL, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
- assert(RISCVISD::VQDOT_VL == Opc || RISCVISD::VQDOTU_VL == Opc ||
- RISCVISD::VQDOTSU_VL == Opc);
- MVT VT = Op0.getSimpleValueType();
- assert(VT == Op1.getSimpleValueType() &&
- VT.getVectorElementType() == MVT::i32);
-
- SDValue Passthru = DAG.getConstant(0, DL, VT);
- MVT ContainerVT = VT;
- if (VT.isFixedLengthVector()) {
- ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
- Passthru = convertToScalableVector(ContainerVT, Passthru, DAG, Subtarget);
- Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
- Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
- }
- auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
- SDValue LocalAccum = DAG.getNode(Opc, DL, ContainerVT,
- {Op0, Op1, Passthru, Mask, VL});
- if (VT.isFixedLengthVector())
- return convertFromScalableVector(VT, LocalAccum, DAG, Subtarget);
- return LocalAccum;
-}
-
static MVT getQDOTXResultType(MVT OpVT) {
ElementCount OpEC = OpVT.getVectorElementCount();
assert(OpEC.isKnownMultipleOf(4) && OpVT.getVectorElementType() == MVT::i8);
@@ -18441,61 +18430,62 @@ static SDValue foldReduceOperandViaVQDOT(SDValue InVec, const SDLoc &DL,
}
}
- // reduce (zext a) <--> reduce (mul zext a. zext 1)
- // reduce (sext a) <--> reduce (mul sext a. sext 1)
+ // zext a <--> partial_reduce_umla 0, a, 1
+ // sext a <--> partial_reduce_smla 0, a, 1
if (InVec.getOpcode() == ISD::ZERO_EXTEND ||
InVec.getOpcode() == ISD::SIGN_EXTEND) {
SDValue A = InVec.getOperand(0);
- if (A.getValueType().getVectorElementType() != MVT::i8 ||
- !TLI.isTypeLegal(A.getValueType()))
+ EVT OpVT = A.getValueType();
+ if (OpVT.getVectorElementType() != MVT::i8 || !TLI.isTypeLegal(OpVT))
return SDValue();
MVT ResVT = getQDOTXResultType(A.getSimpleValueType());
- A = DAG.getBitcast(ResVT, A);
- SDValue B = DAG.getConstant(0x01010101, DL, ResVT);
-
+ SDValue B = DAG.getConstant(0x1, DL, OpVT);
bool IsSigned = InVec.getOpcode() == ISD::SIGN_EXTEND;
- unsigned Opc = IsSigned ? RISCVISD::VQDOT_VL : RISCVISD::VQDOTU_VL;
- return lowerVQDOT(Opc, A, B, DL, DAG, Subtarget);
+ unsigned Opc =
+ IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
+ return DAG.getNode(Opc, DL, ResVT, {DAG.getConstant(0, DL, ResVT), A, B});
}
- // mul (sext, sext) -> vqdot
- // mul (zext, zext) -> vqdotu
- // mul (sext, zext) -> vqdotsu
- // mul (zext, sext) -> vqdotsu (swapped)
- // TODO: Improve .vx handling - we end up with a sub-vector insert
- // which confuses the splat pattern matching. Also, match vqdotus.vx
+ // mul (sext a, sext b) -> partial_reduce_smla 0, a, b
+ // mul (zext a, zext b) -> partial_reduce_umla 0, a, b
+ // mul (sext a, zext b) -> partial_reduce_sumla 0, a, b
+ // mul (zext a, sext b) -> partial_reduce_sumla 0, b, a (swapped)
if (InVec.getOpcode() != ISD::MUL)
return SDValue();
SDValue A = InVec.getOperand(0);
SDValue B = InVec.getOperand(1);
- unsigned Opc = 0;
- if (A.getOpcode() == B.getOpcode()) {
- if (A.getOpcode() == ISD::SIGN_EXTEND)
- Opc = RISCVISD::VQDOT_VL;
- else if (A.getOpcode() == ISD::ZERO_EXTEND)
- Opc = RISCVISD::VQDOTU_VL;
- else
- return SDValue();
- } else {
- if (B.getOpcode() != ISD::ZERO_EXTEND)
- std::swap(A, B);
- if (A.getOpcode() != ISD::SIGN_EXTEND || B.getOpcode() != ISD::ZERO_EXTEND)
- return SDValue();
- Opc = RISCVISD::VQDOTSU_VL;
- }
- assert(Opc);
- if (A.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
- A.getOperand(0).getValueType() != B.getOperand(0).getValueType() ||
+ if (!ISD::isExtOpcode(A.getOpcode()))
+ return SDValue();
+
+ EVT OpVT = A.getOperand(0).getValueType();
+ if (OpVT.getVectorElementType() != MVT::i8 ||
+ OpVT != B.getOperand(0).getValueType() ||
!TLI.isTypeLegal(A.getValueType()))
return SDValue();
- MVT ResVT = getQDOTXResultType(A.getOperand(0).getSimpleValueType());
- A = DAG.getBitcast(ResVT, A.getOperand(0));
- B = DAG.getBitcast(ResVT, B.getOperand(0));
- return lowerVQDOT(Opc, A, B, DL, DAG, Subtarget);
+ unsigned Opc;
+ if (A.getOpcode() == ISD::SIGN_EXTEND && B.getOpcode() == ISD::SIGN_EXTEND)
+ Opc = ISD::PARTIAL_REDUCE_SMLA;
+ else if (A.getOpcode() == ISD::ZERO_EXTEND &&
+ B.getOpcode() == ISD::ZERO_EXTEND)
+ Opc = ISD::PARTIAL_REDUCE_UMLA;
+ else if (A.getOpcode() == ISD::SIGN_EXTEND &&
+ B.getOpcode() == ISD::ZERO_EXTEND)
+ Opc = ISD::PARTIAL_REDUCE_SUMLA;
+ else if (A.getOpcode() == ISD::ZERO_EXTEND &&
+ B.getOpcode() == ISD::SIGN_EXTEND) {
+ Opc = ISD::PARTIAL_REDUCE_SUMLA;
+ std::swap(A, B);
+ } else
+ return SDValue();
+
+ MVT ResVT = getQDOTXResultType(OpVT.getSimpleVT());
+ return DAG.getNode(
+ Opc, DL, ResVT,
+ {DAG.getConstant(0, DL, ResVT), A.getOperand(0), B.getOperand(0)});
}
static SDValue performVECREDUCECombine(SDNode *N, SelectionDAG &DAG,
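The combine now produces the generic PARTIAL_REDUCE_*MLA nodes instead of building RISCVISD::VQDOT*_VL directly; in particular, reducing a bare extend is rewritten as a multiply-accumulate against a splat of 1. A scalar model of one i32 lane accumulating four i8 products (the signed-by-unsigned SUMLA case):

#include <cstdint>

// One 32-bit lane of partial_reduce_sumla 0, a, b: accumulate four
// sign-extended(a) * zero-extended(b) products, as vqdotsu does per lane.
int32_t partialReduceSumla(const int8_t a[4], const uint8_t b[4]) {
  int32_t acc = 0;
  for (int i = 0; i < 4; ++i)
    acc += int32_t(a[i]) * int32_t(b[i]);
  return acc;
}

int main() {
  int8_t a[4] = {-1, 2, -3, 4};
  uint8_t b[4] = {255, 1, 1, 1}; // 255 stays 255 under zero-extension
  // -1*255 + 2*1 + -3*1 + 4*1 = -252
  return partialReduceSumla(a, b) == -252 ? 0 : 1;
}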
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index de424b3..b18a550 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -341,21 +341,33 @@ def simm12 : RISCVSImmLeafOp<12> {
def simm12_no6 : ImmLeaf<XLenVT, [{
return isInt<12>(Imm) && !isInt<6>(Imm) && isInt<12>(-Imm);}]>;
-// A 13-bit signed immediate where the least significant bit is zero.
-def bare_simm13_lsb0 : Operand<OtherVT> {
- let ParserMatchClass = BareSImmNLsb0AsmOperand<13>;
- let PrintMethod = "printBranchOperand";
- let EncoderMethod = "getImmOpValueAsrN<1>";
- let DecoderMethod = "decodeSImmOperandAndLslN<13, 1>";
+class BareSImm13Lsb0MaybeSym : Operand<OtherVT> {
let MCOperandPredicate = [{
int64_t Imm;
if (MCOp.evaluateAsConstantImm(Imm))
return isShiftedInt<12, 1>(Imm);
return MCOp.isBareSymbolRef();
}];
+}
+
+// A 13-bit signed immediate where the least significant bit is zero. The ImmLeaf
+// is needed so that the CompressInstEmitter can correctly add checks for the
+// compress patterns that involve instructions that use this operand. Similar to
+// bare_simm9_lsb0 in RISCVInstrInfoC.td.
+def bare_simm13_lsb0 : BareSImm13Lsb0MaybeSym,
+ ImmLeaf<XLenVT, [{return isShiftedInt<12, 1>(Imm);}]> {
+ let ParserMatchClass = BareSImmNLsb0AsmOperand<13>;
+ let PrintMethod = "printBranchOperand";
+ let EncoderMethod = "getImmOpValueAsrN<1>";
+ let DecoderMethod = "decodeSImmOperandAndLslN<13, 1>";
let OperandType = "OPERAND_PCREL";
}
+// We need this (sort of) duplicate definition because adding ImmLeaf to
+// bare_simm13_lsb0 above makes it unsuitable for codegen patterns where the
+// operand is used to match a basic block (e.g. BccPat<>).
+def bare_simm13_lsb0_bb : BareSImm13Lsb0MaybeSym;
+
class UImm20OperandMaybeSym : RISCVUImmOp<20> {
let MCOperandPredicate = [{
int64_t Imm;
@@ -1660,10 +1672,10 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> {
// Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction.
multiclass BccPat<CondCode Cond, RVInstB Inst> {
def : Pat<(riscv_brcc (XLenVT GPR:$rs1), GPR:$rs2, Cond, bb:$imm12),
- (Inst GPR:$rs1, GPR:$rs2, bare_simm13_lsb0:$imm12)>;
+ (Inst GPR:$rs1, GPR:$rs2, bare_simm13_lsb0_bb:$imm12)>;
// Explicitly select 0 to X0. The register coalescer doesn't always do it.
def : Pat<(riscv_brcc (XLenVT GPR:$rs1), 0, Cond, bb:$imm12),
- (Inst GPR:$rs1, (XLenVT X0), bare_simm13_lsb0:$imm12)>;
+ (Inst GPR:$rs1, (XLenVT X0), bare_simm13_lsb0_bb:$imm12)>;
}
class BrccCompressOpt<CondCode Cond, RVInstB Inst>
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
index 5d7dad4..942dde4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
@@ -798,9 +798,9 @@ def IntCCtoRISCVCCCV : SDNodeXForm<riscv_selectcc, [{
let Predicates = [HasVendorXCVbi, IsRV32], AddedComplexity = 2 in {
def : Pat<(riscv_brcc GPR:$rs1, simm5:$imm5, SETEQ, bb:$imm12),
- (CV_BEQIMM GPR:$rs1, simm5:$imm5, bare_simm13_lsb0:$imm12)>;
+ (CV_BEQIMM GPR:$rs1, simm5:$imm5, bare_simm13_lsb0_bb:$imm12)>;
def : Pat<(riscv_brcc GPR:$rs1, simm5:$imm5, SETNE, bb:$imm12),
- (CV_BNEIMM GPR:$rs1, simm5:$imm5, bare_simm13_lsb0:$imm12)>;
+ (CV_BNEIMM GPR:$rs1, simm5:$imm5, bare_simm13_lsb0_bb:$imm12)>;
defm CC_SImm5_CV : SelectCC_GPR_riirr<GPR, simm5>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index f292c93..dba035b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -19,6 +19,12 @@ def uimm5nonzero : RISCVOp<XLenVT>,
let ParserMatchClass = UImmAsmOperand<5, "NonZero">;
let DecoderMethod = "decodeUImmNonZeroOperand<5>";
let OperandType = "OPERAND_UIMM5_NONZERO";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
return (Imm != 0) && isUInt<5>(Imm);
+ }];
}
def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
@@ -111,6 +117,12 @@ def simm5nonzero : RISCVOp<XLenVT>,
let ParserMatchClass = SImmAsmOperand<5, "NonZero">;
let DecoderMethod = "decodeSImmNonZeroOperand<5>";
let OperandType = "OPERAND_SIMM5_NONZERO";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ return (Imm != 0) && isInt<5>(Imm);
+ }];
}
def simm11 : RISCVSImmLeafOp<11>;
@@ -1322,11 +1334,11 @@ class QCScaledStPat<PatFrag StoreOp, RVInst Inst>
// Match `riscv_brcc` and lower to the appropriate XQCIBI branch instruction.
class BcciPat<CondCode Cond, QCIBranchInst_rii Inst, DAGOperand InTyImm>
: Pat<(riscv_brcc (XLenVT GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12),
- (Inst GPRNoX0:$rs1, InTyImm:$rs2, bare_simm13_lsb0:$imm12)>;
+ (Inst GPRNoX0:$rs1, InTyImm:$rs2, bare_simm13_lsb0_bb:$imm12)>;
class Bcci48Pat<CondCode Cond, QCIBranchInst48_rii Inst, DAGOperand InTyImm>
: Pat<(riscv_brcc (XLenVT GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12),
- (Inst GPRNoX0:$rs1, InTyImm:$rs2, bare_simm13_lsb0:$imm12)>;
+ (Inst GPRNoX0:$rs1, InTyImm:$rs2, bare_simm13_lsb0_bb:$imm12)>;
defm CC_SImm5NonZero_QC : SelectCC_GPR_riirr<GPRNoX0, simm5nonzero>;
defm CC_UImm5NonZero_QC : SelectCC_GPR_riirr<GPRNoX0, uimm5nonzero>;
@@ -1543,3 +1555,18 @@ def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0),
def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm),
(C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>;
} // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32]
+
+let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] in {
+def : CompressPat<(QC_E_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
+ (QC_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
+def : CompressPat<(QC_E_BNEI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
+ (QC_BNEI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
+def : CompressPat<(QC_E_BGEI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
+ (QC_BGEI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
+def : CompressPat<(QC_E_BLTI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
+ (QC_BLTI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
+def : CompressPat<(QC_E_BGEUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
+ (QC_BGEUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
+def : CompressPat<(QC_E_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
+ (QC_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
+} // let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32]
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index f062467..1c59b1e 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1283,6 +1283,20 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
return true;
}
+bool SystemZTargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // We can use NC(G)RK for types in GPRs ...
+ if (VT == MVT::i32 || VT == MVT::i64)
+ return Subtarget.hasMiscellaneousExtensions3();
+
+ // ... or VNC for types in VRs.
+ if (VT.isVector() || VT == MVT::i128)
+ return Subtarget.hasVector();
+
+ return false;
+}
+
// Information about the addressing mode for a memory access.
struct AddressingMode {
// True if a long displacement is supported.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f3536a8..f2f0bf6 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -671,6 +671,7 @@ public:
}
unsigned getStackProbeSize(const MachineFunction &MF) const;
+ bool hasAndNot(SDValue Y) const override;
private:
const SystemZSubtarget &Subtarget;
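The new hasAndNot override tells the generic combiner that x & ~y is a single instruction here, so folds that would otherwise rewrite the pattern away are kept: NC(G)RK covers the GPR widths when miscellaneous-extensions-3 is present, and VNC covers vector and i128 types when the vector facility is present. The operation being advertised, as a plain C++ model:

#include <cstdint>

// What NCGRK (and VNC, lane-wise) computes in one instruction: x AND NOT y.
uint64_t andn(uint64_t x, uint64_t y) { return x & ~y; }

int main() {
  // Clear the low byte of an all-ones word.
  return andn(~0ull, 0xFFull) == 0xFFFFFFFFFFFFFF00ull ? 0 : 1;
}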
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 6c376e4..942ef88 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -8,7 +8,7 @@
#include "SystemZSubtarget.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -83,9 +83,9 @@ bool SystemZSubtarget::isAddressedViaADA(const GlobalValue *GV) const {
// least two byte alignment, then generated code can use relative
// instructions to address the variable. Otherwise, use the ADA to address
// the variable.
- if (GO->getAlignment() & 0x1) {
- return true;
- }
+ if (auto *GV = dyn_cast<GlobalVariable>(GO))
+ if (GV->getAlign() && (*GV->getAlign()).value() & 0x1)
+ return true;
// getKindForGlobal only works with definitions
if (GO->isDeclaration()) {
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 7f06e0f..c7abb36 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -301,8 +301,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
}
+ // Win64 and UEFI64 targets require the REX prefix for indirect tail jumps.
+ bool IsX64 = STI->isTargetWin64() || STI->isTargetUEFI64();
// Jump to label or value in register.
- bool IsWin64 = STI->isTargetWin64();
if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
unsigned Op;
@@ -341,7 +342,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
} else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
unsigned Op = (Opcode == X86::TCRETURNmi)
? X86::TAILJMPm
- : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
+ : (IsX64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
@@ -349,10 +350,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
(Opcode == X86::TCRETURNri64_ImpCall)) {
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
- TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
+ TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
.add(JumpTarget);
} else {
- assert(!IsWin64 && "Win64 requires REX for indirect jumps.");
+ assert(!IsX64 && "Win64 and UEFI64 require REX for indirect jumps.");
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
.add(JumpTarget);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a9dbb29..3db7e2c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4311,6 +4311,25 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
}
}
+ if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ uint64_t Idx = N->getConstantOperandVal(1);
+
+ // Collect all the subvectors from the source vector and slice off the
+ // extraction.
+ SmallVector<SDValue, 4> SrcOps;
+ if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
+ VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
+ (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
+ (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
+ unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
+ unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
+ Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
+ return true;
+ }
+ }
+
return false;
}
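The new EXTRACT_SUBVECTOR case slices a concatenation: if the source decomposes into equal-sized subvectors and both the result width and the extract index line up with that granularity, the extraction is just a contiguous run of those parts. The index arithmetic, worked in plain integers with hypothetical sizes:

#include <cassert>

int main() {
  // Source = concat of 4 subvectors of 4 elements (v16i32 as 4 x v4i32);
  // extract a v8i32 starting at element index 8.
  unsigned SubElts = 4;   // elements per collected subvector
  unsigned SubBits = 128; // bits per subvector
  unsigned VTBits = 256;  // bits of the extracted type (v8i32)
  unsigned Idx = 8;       // constant extract index, in elements

  assert(VTBits > SubBits && VTBits % SubBits == 0 && Idx % SubElts == 0);
  unsigned SubIdx = Idx / SubElts;     // first part to take: 2
  unsigned NumSubs = VTBits / SubBits; // how many parts: 2
  // Ops receives parts [SubIdx, SubIdx + NumSubs), i.e. parts 2 and 3.
  return (SubIdx == 2 && NumSubs == 2) ? 0 : 1;
}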
@@ -9790,6 +9809,10 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
(int)ExpectedVT.getVectorNumElements() != MaskSize)
return false;
+ // Exact match.
+ if (Idx == ExpectedIdx && Op == ExpectedOp)
+ return true;
+
switch (Op.getOpcode()) {
case ISD::BUILD_VECTOR:
// If the values are build vectors, we can look through them to find
@@ -9837,8 +9860,7 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
SmallVector<int, 8> Mask;
DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
SDValue Src = Op.getOperand(0);
- return (Mask[Idx] == Mask[ExpectedIdx]) ||
- IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
+ return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
Mask[ExpectedIdx]);
}
break;
@@ -14306,9 +14328,9 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
- assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
+ assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
"Failed to lift all the high half inputs to the low mask!");
- assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
@@ -52292,59 +52314,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
}
-static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
- SDValue And1_L, SDValue And1_R,
- const SDLoc &DL, SelectionDAG &DAG) {
- if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
- return SDValue();
- SDValue NotOp = And0_L->getOperand(0);
- if (NotOp == And1_R)
- std::swap(And1_R, And1_L);
- if (NotOp != And1_L)
- return SDValue();
-
- // (~(NotOp) & And0_R) | (NotOp & And1_R)
- // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
- EVT VT = And1_L->getValueType(0);
- SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
- SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
- SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
- SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
- return Xor1;
-}
-
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for targets without a fused
-/// "and-not" operation. This function is intended to be called from a
-/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
- // Note that masked-merge variants using XOR or ADD expressions are
- // normalized to OR by InstCombine so we only check for OR.
- assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
- SDValue N0 = Node->getOperand(0);
- if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
- return SDValue();
- SDValue N1 = Node->getOperand(1);
- if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
- return SDValue();
-
- SDLoc DL(Node);
- SDValue N00 = N0->getOperand(0);
- SDValue N01 = N0->getOperand(1);
- SDValue N10 = N1->getOperand(0);
- SDValue N11 = N1->getOperand(1);
- if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
- return Result;
- if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
- return Result;
- if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
- return Result;
- if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
- return Result;
- return SDValue();
-}
-
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
@@ -52748,11 +52717,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
}
}
- // We should fold "masked merge" patterns when `andn` is not available.
- if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
- if (SDValue R = foldMaskedMerge(N, DAG))
- return R;
-
if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
return R;
@@ -55354,10 +55318,17 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto IsMinMaxLegal = [&](EVT VT) {
+ if (!TLI.isTypeLegal(VT))
+ return false;
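+ // vXf16 min/max also requires AVX512-FP16, plus VLX unless the vector is
+ // 512 bits wide.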
+ return VT.getScalarType() != MVT::f16 ||
+ (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
+ };
+
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
(Subtarget.hasFP16() && VT == MVT::f16) ||
- (VT.isVector() && TLI.isTypeLegal(VT))))
+ (VT.isVector() && IsMinMaxLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
@@ -58110,21 +58081,31 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
}
}
- // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
- // (sub Y, (sext (vXi1 X))).
- // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
- // generic DAG combine without a legal type check, but adding this there
- // caused regressions.
if (VT.isVector()) {
SDValue X, Y;
EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorElementCount());
+
+ // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
+ // (sub Y, (sext (vXi1 X))).
+ // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
+ // in generic DAG combine without a legal type check, but adding this there
+ // caused regressions.
if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
m_Value(Y)))) {
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
}
+
+ // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
+ // canonicalisation as we don't have good vXi8 shifts.
+ if (VT.getScalarType() == MVT::i8 &&
+ sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
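+ // (srl Y, 7) extracts the i8 sign bit; rebuild it as a 0/-1 mask with a
+ // signed compare against zero and subtract it instead.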
+ SDValue Cmp =
+ DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
+ return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
+ }
}
// Peephole for 512-bit VPDPBSSD on non-VLX targets.
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 53d78ed..eb52801 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -41,8 +41,6 @@
using namespace llvm;
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
#define DEBUG_TYPE "coro-frame"
namespace {
@@ -842,18 +840,12 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
DILocation::get(DIS->getContext(), LineNum, /*Column=*/1, DIS);
assert(FrameDIVar->isValidLocationForIntrinsic(DILoc));
- if (UseNewDbgInfoFormat) {
- DbgVariableRecord *NewDVR =
- new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
- DBuilder.createExpression(), DILoc,
- DbgVariableRecord::LocationType::Declare);
- BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
- It->getParent()->insertDbgRecordBefore(NewDVR, It);
- } else {
- DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
- DBuilder.createExpression(), DILoc,
- Shape.getInsertPtAfterFramePtr());
- }
+ DbgVariableRecord *NewDVR =
+ new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
+ DBuilder.createExpression(), DILoc,
+ DbgVariableRecord::LocationType::Declare);
+ BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
+ It->getParent()->insertDbgRecordBefore(NewDVR, It);
}
// Build a struct that will keep state for an active coroutine.
@@ -1132,23 +1124,15 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
}
auto SalvageOne = [&](auto *DDI) {
- bool AllowUnresolved = false;
// This dbg.declare is preserved for all coro-split function
// fragments. It will be unreachable in the main function, and
// processed by coro::salvageDebugInfo() by the Cloner.
- if (UseNewDbgInfoFormat) {
- DbgVariableRecord *NewDVR = new DbgVariableRecord(
- ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
- DDI->getExpression(), DDI->getDebugLoc(),
- DbgVariableRecord::LocationType::Declare);
- Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
- NewDVR, Builder.GetInsertPoint());
- } else {
- DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
- .insertDeclare(CurrentReload, DDI->getVariable(),
- DDI->getExpression(), DDI->getDebugLoc(),
- Builder.GetInsertPoint());
- }
+ DbgVariableRecord *NewDVR = new DbgVariableRecord(
+ ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
+ DDI->getExpression(), DDI->getDebugLoc(),
+ DbgVariableRecord::LocationType::Declare);
+ Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
+ NewDVR, Builder.GetInsertPoint());
// This dbg.declare is for the main function entry point. It
// will be deleted in all coro-split functions.
coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/);
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 1e908b7..3799a69 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12592,19 +12592,38 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
ChangeStatus updateImpl(Attributor &A) override {
uint32_t OldAddressSpace = AssumedAddressSpace;
+ unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value();
auto CheckAddressSpace = [&](Value &Obj) {
+ // Ignore undef.
if (isa<UndefValue>(&Obj))
return true;
- if (auto *Arg = dyn_cast<Argument>(&Obj)) {
+
+ // If the object already has a non-flat address space, we simply take it.
+ unsigned ObjAS = Obj.getType()->getPointerAddressSpace();
+ if (ObjAS != FlatAS)
+ return takeAddressSpace(ObjAS);
+
+ // At this point, we know Obj is in the flat address space. For a final
+ // attempt, we want to use getAssumedAddrSpace, but first we must get the
+ // associated function, if possible.
+ Function *F = nullptr;
+ if (auto *Arg = dyn_cast<Argument>(&Obj))
+ F = Arg->getParent();
+ else if (auto *I = dyn_cast<Instruction>(&Obj))
+ F = I->getFunction();
+
+ // Use getAssumedAddrSpace if the associated function exists.
+ if (F) {
auto *TTI =
- A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
- *Arg->getParent());
- unsigned AssumedAS = TTI->getAssumedAddrSpace(Arg);
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(*F);
+ unsigned AssumedAS = TTI->getAssumedAddrSpace(&Obj);
if (AssumedAS != ~0U)
return takeAddressSpace(AssumedAS);
}
- return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
+
+ // Nothing else we can do; fall back to the flat address space.
+ return takeAddressSpace(FlatAS);
};
auto *AUO = A.getOrCreateAAFor<AAUnderlyingObjects>(getIRPosition(), this,
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 8d8a737..ab67a09 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1173,7 +1173,7 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
if (BSI.isAllOnes()) {
TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
: TypeTestResolution::AllOnes;
- } else if (BSI.BitSize <= 64) {
+ } else if (BSI.BitSize <= IntPtrTy->getBitWidth()) {
TIL.TheKind = TypeTestResolution::Inline;
uint64_t InlineBits = 0;
for (auto Bit : BSI.Bits)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index b2d409a..dd7ae7e 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -5232,8 +5232,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
IRPosition::callsite_returned(CB),
[&](const IRPosition &IRP, const AbstractAttribute *AA,
bool &UsedAssumedInformation) -> std::optional<Value *> {
- assert((isValidState() ||
- (SimplifiedValue && *SimplifiedValue == nullptr)) &&
+ assert((isValidState() || SimplifiedValue == nullptr) &&
"Unexpected invalid state!");
if (!isAtFixpoint()) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cfb4af3..c169ab25 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3076,10 +3076,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
case Intrinsic::arm_neon_aesd:
case Intrinsic::arm_neon_aese:
case Intrinsic::aarch64_crypto_aesd:
- case Intrinsic::aarch64_crypto_aese: {
+ case Intrinsic::aarch64_crypto_aese:
+ case Intrinsic::aarch64_sve_aesd:
+ case Intrinsic::aarch64_sve_aese: {
Value *DataArg = II->getArgOperand(0);
Value *KeyArg = II->getArgOperand(1);
+ // A zero operand may appear on either side; canonicalize it into KeyArg.
+ if (!match(KeyArg, m_ZeroInt()))
+ std::swap(KeyArg, DataArg);
+
// Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
Value *Data, *Key;
if (match(KeyArg, m_ZeroInt()) &&
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 8a6f105..c2315d5 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4167,6 +4167,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, PtrSrcOrigin);
}
+ // Instrument AVX permutation intrinsics.
+ // We apply the same permutation (argument index 1) to the shadow.
+ void handleAVXVpermilvar(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Shadow = getShadow(&I, 0);
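+ // The permutation control operand must itself be fully initialized.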
+ insertShadowCheck(I.getArgOperand(1), &I);
+
+ // Shadows are integer-ish types but some intrinsics require a
+ // different (e.g., floating-point) type.
+ Shadow = IRB.CreateBitCast(Shadow, I.getArgOperand(0)->getType());
+ CallInst *CI = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {Shadow, I.getArgOperand(1)});
+
+ setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
+ setOriginForNaryOp(I);
+ }
+
// Instrument BMI / BMI2 intrinsics.
// All of these intrinsics are Z = I(X, Y)
// where the types of all operands and the result match, and are either i32 or
@@ -5112,6 +5129,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512: {
+ handleAVXVpermilvar(I);
+ break;
+ }
+
case Intrinsic::x86_avx512fp16_mask_add_sh_round:
case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
case Intrinsic::x86_avx512fp16_mask_mul_sh_round:
diff --git a/llvm/lib/Transforms/ObjCARC/BlotMapVector.h b/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
index 2fa07cf..0e64cca 100644
--- a/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -53,8 +53,7 @@ public:
const_iterator end() const { return Vector.end(); }
ValueT &operator[](const KeyT &Arg) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(Arg, size_t(0)));
+ std::pair<typename MapTy::iterator, bool> Pair = Map.try_emplace(Arg);
if (Pair.second) {
size_t Num = Vector.size();
Pair.first->second = Num;
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index dd4d4ef..07bc623 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -381,7 +381,7 @@ void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType::iterator Itr;
bool Inserted;
ConstPtrUnionType Cand = ConstInt;
- std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
+ std::tie(Itr, Inserted) = ConstCandMap.try_emplace(Cand);
if (Inserted) {
ConstIntCandVec.push_back(ConstantCandidate(ConstInt));
Itr->second = ConstIntCandVec.size() - 1;
@@ -439,7 +439,7 @@ void ConstantHoistingPass::collectConstantCandidates(
ConstCandMapType::iterator Itr;
bool Inserted;
ConstPtrUnionType Cand = ConstExpr;
- std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
+ std::tie(Itr, Inserted) = ConstCandMap.try_emplace(Cand);
if (Inserted) {
ExprCandVec.push_back(ConstantCandidate(
ConstantInt::get(Type::getInt32Ty(*Ctx), Offset.getLimitedValue()),
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index cbad5dd..2786d81 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1010,9 +1010,9 @@ void State::addInfoForInductions(BasicBlock &BB) {
auto IncUnsigned = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_UGT);
auto IncSigned = SE.getMonotonicPredicateType(AR, CmpInst::ICMP_SGT);
bool MonotonicallyIncreasingUnsigned =
- IncUnsigned && *IncUnsigned == ScalarEvolution::MonotonicallyIncreasing;
+ IncUnsigned == ScalarEvolution::MonotonicallyIncreasing;
bool MonotonicallyIncreasingSigned =
- IncSigned && *IncSigned == ScalarEvolution::MonotonicallyIncreasing;
+ IncSigned == ScalarEvolution::MonotonicallyIncreasing;
// If SCEV guarantees that AR does not wrap, PN >= StartValue can be added
// unconditionally.
if (MonotonicallyIncreasingUnsigned)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index afac49a..242e571 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2788,7 +2788,7 @@ std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
}
std::pair<UseMapTy::iterator, bool> P =
- UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+ UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
if (!P.second) {
// A use already existed with this base.
size_t LUIdx = P.first->second;
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 20279bf..5a51824 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -232,6 +232,15 @@ static bool isUniformShape(Value *V) {
if (I->isBinaryOp())
return true;
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::abs:
+ case Intrinsic::fabs:
+ return true;
+ default:
+ return false;
+ }
+
switch (I->getOpcode()) {
case Instruction::FNeg:
return true;
@@ -618,7 +627,7 @@ public:
case Intrinsic::matrix_column_major_store:
return true;
default:
- return false;
+ return isUniformShape(II);
}
return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
}
@@ -1064,8 +1073,8 @@ public:
VisitBinaryOperator(BinOp, SI);
else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
VisitUnaryOperator(UnOp, SI);
- else if (CallInst *CInst = dyn_cast<CallInst>(Inst))
- VisitCallInst(CInst);
+ else if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Inst))
+ VisitIntrinsicInst(Intr, SI);
else if (match(Inst, m_Load(m_Value(Op1))))
VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);
else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
@@ -1111,23 +1120,48 @@ public:
}
/// Replace intrinsic calls.
- void VisitCallInst(CallInst *Inst) {
- assert(Inst->getCalledFunction() &&
- Inst->getCalledFunction()->isIntrinsic());
-
- switch (Inst->getCalledFunction()->getIntrinsicID()) {
+ void VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &Shape) {
+ switch (Inst->getIntrinsicID()) {
case Intrinsic::matrix_multiply:
LowerMultiply(Inst);
- break;
+ return;
case Intrinsic::matrix_transpose:
LowerTranspose(Inst);
- break;
+ return;
case Intrinsic::matrix_column_major_load:
LowerColumnMajorLoad(Inst);
- break;
+ return;
case Intrinsic::matrix_column_major_store:
LowerColumnMajorStore(Inst);
- break;
+ return;
+ case Intrinsic::abs:
+ case Intrinsic::fabs: {
+ IRBuilder<> Builder(Inst);
+ MatrixTy Result;
+ MatrixTy M = getMatrix(Inst->getOperand(0), Shape, Builder);
+ Builder.setFastMathFlags(getFastMathFlags(Inst));
+
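+ // Lower elementwise by applying the intrinsic to each of the matrix's
+ // constituent vectors.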
+ for (auto &Vector : M.vectors()) {
+ switch (Inst->getIntrinsicID()) {
+ case Intrinsic::abs:
+ Result.addVector(Builder.CreateBinaryIntrinsic(Intrinsic::abs, Vector,
+ Inst->getOperand(1)));
+ continue;
+ case Intrinsic::fabs:
+ Result.addVector(
+ Builder.CreateUnaryIntrinsic(Inst->getIntrinsicID(), Vector));
+ continue;
+ default:
+ llvm_unreachable("unexpected intrinsic");
+ }
+ }
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
+ return;
+ }
default:
llvm_unreachable(
"only intrinsics supporting shape info should be seen here");
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 3e323cc..729813a 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -705,7 +705,7 @@ bool diagnoseMisSizedDbgValue(Module &M, DbgValTy *DbgVal) {
bool HasBadSize = false;
if (Ty->isIntegerTy()) {
auto Signedness = DbgVal->getVariable()->getSignedness();
- if (Signedness && *Signedness == DIBasicType::Signedness::Signed)
+ if (Signedness == DIBasicType::Signedness::Signed)
HasBadSize = ValueOperandSize < *DbgVarSize;
} else {
HasBadSize = ValueOperandSize != *DbgVarSize;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index be71cb6..0184a1e 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -88,8 +88,6 @@
using namespace llvm;
using namespace llvm::PatternMatch;
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
#define DEBUG_TYPE "local"
STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
@@ -1548,9 +1546,9 @@ Align llvm::tryEnforceAlignment(Value *V, Align PrefAlign,
return PrefAlign;
}
- if (auto *GO = dyn_cast<GlobalObject>(V)) {
+ if (auto *GV = dyn_cast<GlobalVariable>(V)) {
// TODO: as above, this shouldn't be necessary.
- Align CurrentAlign = GO->getPointerAlignment(DL);
+ Align CurrentAlign = GV->getPointerAlignment(DL);
if (PrefAlign <= CurrentAlign)
return CurrentAlign;
@@ -1558,16 +1556,16 @@ Align llvm::tryEnforceAlignment(Value *V, Align PrefAlign,
// of the global. If the memory we set aside for the global may not be the
// memory used by the final program then it is impossible for us to reliably
// enforce the preferred alignment.
- if (!GO->canIncreaseAlignment())
+ if (!GV->canIncreaseAlignment())
return CurrentAlign;
- if (GO->isThreadLocal()) {
- unsigned MaxTLSAlign = GO->getParent()->getMaxTLSAlignment() / CHAR_BIT;
+ if (GV->isThreadLocal()) {
+ unsigned MaxTLSAlign = GV->getParent()->getMaxTLSAlignment() / CHAR_BIT;
if (MaxTLSAlign && PrefAlign > Align(MaxTLSAlign))
PrefAlign = Align(MaxTLSAlign);
}
- GO->setAlignment(PrefAlign);
+ GV->setAlignment(PrefAlign);
return PrefAlign;
}
@@ -1691,16 +1689,10 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV,
DIExpression *DIExpr,
const DebugLoc &NewLoc,
BasicBlock::iterator Instr) {
- if (!UseNewDbgInfoFormat) {
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, Instr);
- } else {
- // RemoveDIs: if we're using the new debug-info format, allocate a
- // DbgVariableRecord directly instead of a dbg.value intrinsic.
- ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
- DbgVariableRecord *DV =
- new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
- Instr->getParent()->insertDbgRecordBefore(DV, Instr);
- }
+ ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
+ DbgVariableRecord *DVRec =
+ new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
+ Instr->getParent()->insertDbgRecordBefore(DVRec, Instr);
}
static void insertDbgValueOrDbgVariableRecordAfter(
@@ -1838,7 +1830,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR,
// then we want to insert a dbg.value for the corresponding fragment.
LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DVR
<< '\n');
- assert(UseNewDbgInfoFormat);
// For now, when there is a store to parts of the variable (but we do not
// know which part) we insert a dbg.value intrinsic to indicate that we
@@ -1919,7 +1910,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI,
// future if multi-location support is added to the IR, it might be
// preferable to keep tracking both the loaded value and the original
// address in case the alloca can not be elided.
- assert(UseNewDbgInfoFormat);
// Create a DbgVariableRecord directly and insert.
ValueAsMetadata *LIVAM = ValueAsMetadata::get(LI);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fc8ebeb..333e50e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7218,13 +7218,15 @@ static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
return StartVPV->getLiveInIRValue();
}
-// If \p R is a Compute{Reduction,AnyOf,FindLastIV}Result when vectorizing the
+// If \p EpiResumePhiR is a resume VPPhi for a reduction when vectorizing the
// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
// from the main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
- VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
- BasicBlock *BypassBlock) {
- auto *EpiRedResult = dyn_cast<VPInstruction>(R);
+ VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
+ // Get the VPInstruction computing the reduction result in the middle block.
+ // The first operand may not be from the middle block if it is not connected
+ // to the scalar preheader. In that case, there's nothing to fix.
+ auto *EpiRedResult = dyn_cast<VPInstruction>(EpiResumePhiR->getOperand(0));
if (!EpiRedResult ||
(EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
@@ -7235,8 +7237,14 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc =
EpiRedHeaderPhi->getRecurrenceDescriptor();
- Value *MainResumeValue =
- EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
+ Value *MainResumeValue;
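+ // Look through a Broadcast or ReductionStartVector to recover the scalar
+ // resume value from the main vector loop.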
+ if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
+ assert((VPI->getOpcode() == VPInstruction::Broadcast ||
+ VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
+ "unexpected start recipe");
+ MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
+ } else
+ MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
Value *StartV = EpiRedResult->getOperand(1)->getLiveInIRValue();
@@ -7269,12 +7277,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
// When fixing reductions in the epilogue loop we should already have
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
// over the incoming values correctly.
- using namespace VPlanPatternMatch;
- assert(count_if(EpiRedResult->users(), IsaPred<VPPhi>) == 1 &&
- "ResumePhi must have a single user");
- auto *EpiResumePhiVPI =
- cast<VPInstruction>(*find_if(EpiRedResult->users(), IsaPred<VPPhi>));
- auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
+ auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiR, true));
EpiResumePhi->setIncomingValueForBlock(
BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
@@ -7383,18 +7386,12 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
}
}
VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
- ArrayRef<VPBlockBase *> ScalarPreds = ScalarPH->getPredecessors();
- if (!ScalarPreds.empty()) {
+ if (ScalarPH->getNumPredecessors() > 0) {
// If ScalarPH has predecessors, we may need to update its reduction
- // resume values. If there is a middle block, it must be the first
- // predecessor. Note that the first predecessor may not be the middle
- // block, if the middle block doesn't branch to the scalar preheader. In
- // that case, fixReductionScalarResumeWhenVectorizingEpilog will be a
- // no-op.
- auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPreds[0]);
- for (VPRecipeBase &R : *MiddleVPBB) {
- fixReductionScalarResumeWhenVectorizingEpilog(
- &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
+ // resume values.
+ for (VPRecipeBase &R : ScalarPH->phis()) {
+ fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), State,
+ BypassBlock);
}
}
}
@@ -9173,7 +9170,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
+ Type *PhiTy = PhiR->getUnderlyingValue()->getType();
// If tail is folded by masking, introduce selects between the phi
// and the users outside the vector region of each reduction, at the
// beginning of the dedicated latch block.
@@ -9311,6 +9308,27 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// start value.
PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
}
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ VPBuilder PHBuilder(Plan->getVectorPreheader());
+ VPValue *Iden = Plan->getOrAddLiveIn(
+ getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
+ // If the PHI is used by a partial reduction, set the scale factor.
+ unsigned ScaleFactor =
+ RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
+ .value_or(1);
+ Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
+ auto *ScaleFactorVPV =
+ Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+ VPValue *StartV = PHBuilder.createNaryOp(
+ VPInstruction::ReductionStartVector,
+ {PhiR->getStartValue(), Iden, ScaleFactorVPV},
+ PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
+ : FastMathFlags());
+ PhiR->setOperand(0, StartV);
+ }
}
for (VPRecipeBase *R : ToDelete)
R->eraseFromParent();
@@ -9741,25 +9759,16 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
// VPlan.
// FIXME: Improve modeling for canonical IV start values in the epilogue
// loop.
- BasicBlock *MainMiddle = find_singleton<BasicBlock>(
- predecessors(L->getLoopPreheader()),
- [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
- if (BB != EPI.MainLoopIterationCountCheck &&
- BB != EPI.EpilogueIterationCountCheck &&
- BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
- return BB;
- return nullptr;
- });
using namespace llvm::PatternMatch;
Type *IdxTy = IV->getScalarType();
PHINode *EPResumeVal = find_singleton<PHINode>(
L->getLoopPreheader()->phis(),
- [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
+ [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
if (P.getType() == IdxTy &&
- P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
match(
P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
- m_SpecificInt(0)))
+ m_SpecificInt(0)) &&
+ is_contained(P.incoming_values(), EPI.VectorTripCount))
return &P;
return nullptr;
});
@@ -9825,6 +9834,15 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
ResumeV =
Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
+ } else {
+ VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
+ assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
+ "unexpected start value");
+ VPI->setOperand(0, StartVal);
+ continue;
+ }
}
} else {
// Retrieve the induction resume values for wide inductions from
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 273df55..bbcbfee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -934,6 +934,10 @@ public:
/// Scale the first operand (vector step) by the second operand
/// (scalar-step). Casts both operands to the result type if needed.
WideIVStep,
+ /// Start vector for reductions with 3 operands: the original start value,
+ /// the identity value for the reduction and an integer indicating the
+ /// scaling factor.
+ ReductionStartVector,
// Creates a step vector starting from 0 to VF with a step of 1.
StepVector,
@@ -2231,7 +2235,7 @@ public:
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
- return Op == getStartValue();
+ return isOrdered() || isInLoop();
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 81fc93b..76da5b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
switch (Opcode) {
case Instruction::ExtractElement:
case Instruction::Freeze:
+ case VPInstruction::ReductionStartVector:
return inferScalarType(R->getOperand(0));
case Instruction::Select: {
Type *ResTy = inferScalarType(R->getOperand(1));
@@ -395,6 +396,10 @@ static unsigned getVFScaleFactor(VPRecipeBase *R) {
return RR->getVFScaleFactor();
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
return RR->getVFScaleFactor();
+ assert(
+ (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
+ VPInstruction::ReductionStartVector) &&
+ "getting scaling factor of reduction-start-vector not implemented yet");
return 1;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 78ad9b7..62b99d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -604,6 +604,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateVectorSplat(
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
}
+ case VPInstruction::ReductionStartVector: {
+ if (State.VF.isScalar())
+ return State.get(getOperand(0), true);
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(getFastMathFlags());
+ // If this start vector is scaled then it should produce a vector with fewer
+ // elements than the VF.
+ ElementCount VF = State.VF.divideCoefficientBy(
+ cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
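+ // Splat the identity value and insert the actual start value into lane 0.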
+ auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
+ Constant *Zero = Builder.getInt32(0);
+ return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
+ Zero);
+ }
case VPInstruction::ComputeAnyOfResult: {
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
// and will be removed by breaking up the recipe further.
@@ -899,6 +913,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::PtrAdd:
case VPInstruction::WideIVStep:
case VPInstruction::StepVector:
+ case VPInstruction::ReductionStartVector:
return false;
default:
return true;
@@ -929,6 +944,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
+ case VPInstruction::ReductionStartVector:
return true;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
@@ -1034,6 +1050,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstActiveLane:
O << "first-active-lane";
break;
+ case VPInstruction::ReductionStartVector:
+ O << "reduction-start-vector";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1617,6 +1636,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
Opcode == VPInstruction::WideIVStep ||
+ Opcode == VPInstruction::ReductionStartVector ||
Opcode == VPInstruction::ComputeReductionResult;
case OperationType::NonNegOp:
return Opcode == Instruction::ZExt;
@@ -3847,17 +3867,19 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPReductionPHIRecipe::execute(VPTransformState &State) {
- // If this phi is fed by a scaled reduction then it should output a
- // vector with fewer elements than the VF.
- ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ VPValue *StartVPV = getStartValue();
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- auto *ScalarTy = State.TypeAnalysis.inferScalarType(this);
+ BasicBlock *VectorPH =
+ State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
bool ScalarPHI = State.VF.isScalar() || IsInLoop;
- Type *VecTy = ScalarPHI ? ScalarTy : VectorType::get(ScalarTy, VF);
+ Value *StartV = State.get(StartVPV, ScalarPHI);
+ Type *VecTy = StartV->getType();
BasicBlock *HeaderBB = State.CFG.PrevBB;
assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
@@ -3866,49 +3888,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Phi->insertBefore(HeaderBB->getFirstInsertionPt());
State.set(this, Phi, IsInLoop);
- BasicBlock *VectorPH =
- State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
- // Create start and identity vector values for the reduction in the preheader.
- // TODO: Introduce recipes in VPlan preheader to create initial values.
- IRBuilderBase::InsertPointGuard IPBuilder(State.Builder);
- State.Builder.SetInsertPoint(VectorPH->getTerminator());
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- VPValue *StartVPV = getStartValue();
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
- RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
- RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
- // [I|F]FindLastIV will use a sentinel value to initialize the reduction
- // phi or the resume value from the main vector loop when vectorizing the
- // epilogue loop. In the exit block, ComputeReductionResult will generate
- // checks to verify if the reduction result is the sentinel value. If the
- // result is the sentinel value, it will be corrected back to the start
- // value.
- // TODO: The sentinel value is not always necessary. When the start value is
- // a constant, and smaller than the start value of the induction variable,
- // the start value can be directly used to initialize the reduction phi.
- Phi->addIncoming(State.get(StartVPV, ScalarPHI), VectorPH);
- return;
- }
-
- Value *Iden = getRecurrenceIdentity(RK, VecTy->getScalarType(),
- RdxDesc.getFastMathFlags());
- unsigned CurrentPart = getUnrollPart(*this);
- Value *StartV = StartVPV->getLiveInIRValue();
- if (!ScalarPHI) {
- if (CurrentPart == 0) {
- Iden = State.Builder.CreateVectorSplat(VF, Iden);
- Constant *Zero = State.Builder.getInt32(0);
- StartV = State.Builder.CreateInsertElement(Iden, StartV, Zero);
- } else {
- Iden = State.Builder.CreateVectorSplat(VF, Iden);
- }
- }
-
- Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
- Phi->addIncoming(StartVal, VectorPH);
+ Phi->addIncoming(StartV, VectorPH);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ea617f0..dc3c7bf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1153,6 +1153,16 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
}
+ // Simplify redundant ReductionStartVector recipes after unrolling.
+ VPValue *StartV;
+ if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
+ m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
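+ // In-loop reduction phis use the scalar start value directly, so they can
+ // bypass the start vector.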
+ Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
+ return PhiR && PhiR->isInLoop();
+ });
+ return;
+ }
}
void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 335301a..e4c068e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Intrinsics.h"
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
namespace {
@@ -223,6 +224,22 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
Copy->addOperand(R);
Copy->addOperand(getConstantVPV(Part));
} else if (RdxPhi) {
+ // If the start value is a ReductionStartVector, use the identity value
+ // (second operand) for unrolled parts. If the scaling factor is > 1,
+ // create a new ReductionStartVector with the scale factor and both
+ // operands set to the identity value.
+ if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
+ assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
+ "unexpected start VPInstruction");
+ if (match(VPI->getOperand(2), m_SpecificInt(1))) {
+ Copy->setOperand(0, VPI->getOperand(1));
+ } else if (Part == 1) {
+ auto *C = VPI->clone();
+ C->setOperand(0, C->getOperand(1));
+ C->insertAfter(VPI);
+ addUniformForAllParts(C);
+ }
+ }
Copy->addOperand(getConstantVPV(Part));
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
@@ -233,7 +250,6 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
- using namespace llvm::VPlanPatternMatch;
if (match(&R, m_BranchOnCond(m_VPValue())) ||
match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
return;
@@ -301,7 +317,6 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
}
}
-using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
auto *VPR = dyn_cast<VPRegionBlock>(VPB);
if (VPR) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir
new file mode 100644
index 0000000..acfbec0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-lowering -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: unmerge_dup8
+legalized: true
+body: |
+ bb.1.entry:
+ ; CHECK-LABEL: name: unmerge_dup8
+ ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[DUPLANE8_:%[0-9]+]]:_(<8 x s8>) = G_DUPLANE8 [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[DUPLANE8_]](<8 x s8>)
+ ; CHECK-NEXT: $d1 = COPY [[DUPLANE8_]](<8 x s8>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(<16 x s8>) = COPY $q0
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(<16 x s8>) = G_DUPLANE8 %0, %1
+ %3:_(<8 x s8>), %4:_(<8 x s8>) = G_UNMERGE_VALUES %2
+ $d0 = COPY %3
+ $d1 = COPY %4
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: unmerge_dup16
+legalized: true
+body: |
+ bb.1.entry:
+ ; CHECK-LABEL: name: unmerge_dup16
+ ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[DUPLANE16_:%[0-9]+]]:_(<4 x s16>) = G_DUPLANE16 [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[DUPLANE16_]](<4 x s16>)
+ ; CHECK-NEXT: $d1 = COPY [[DUPLANE16_]](<4 x s16>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(<8 x s16>) = COPY $q0
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(<8 x s16>) = G_DUPLANE16 %0, %1
+ %3:_(<4 x s16>), %4:_(<4 x s16>) = G_UNMERGE_VALUES %2
+ $d0 = COPY %3
+ $d1 = COPY %4
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: unmerge_dup32
+legalized: true
+body: |
+ bb.1.entry:
+ ; CHECK-LABEL: name: unmerge_dup32
+ ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[DUPLANE32_:%[0-9]+]]:_(<2 x s32>) = G_DUPLANE32 [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[DUPLANE32_]](<2 x s32>)
+ ; CHECK-NEXT: $d1 = COPY [[DUPLANE32_]](<2 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(<4 x s32>) = COPY $q0
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(<4 x s32>) = G_DUPLANE32 %0, %1
+ %3:_(<2 x s32>), %4:_(<2 x s32>) = G_UNMERGE_VALUES %2
+ $d0 = COPY %3
+ $d1 = COPY %4
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: unmerge_dup64
+legalized: true
+body: |
+ bb.1.entry:
+ ; CHECK-LABEL: name: unmerge_dup64
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[DUPLANE64_:%[0-9]+]]:_(<2 x s64>) = G_DUPLANE64 [[COPY]], [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C1]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C2]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[EVEC]](s64)
+ ; CHECK-NEXT: $d1 = COPY [[EVEC1]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:_(<2 x s64>) = COPY $q0
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(<2 x s64>) = G_DUPLANE64 %0, %1
+ %3:_(s64), %4:_(s64) = G_UNMERGE_VALUES %2
+ $d0 = COPY %3
+ $d1 = COPY %4
+ RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir
index 14d44d8..8dedb26 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-ext.mir
@@ -255,7 +255,8 @@ body: |
; CHECK: liveins: $q0, $q1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %v1:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: %shuf:_(<8 x s16>) = G_REV64 %v1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: %shuf:_(<8 x s16>) = G_DUPLANE16 %v1, [[C]](s64)
; CHECK-NEXT: $q0 = COPY %shuf(<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%v1:_(<8 x s16>) = COPY $q0
@@ -298,8 +299,8 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %v1:_(<2 x s64>) = COPY $q0
- ; CHECK-NEXT: %v2:_(<2 x s64>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %shuf:_(<2 x s64>) = G_TRN2 %v1, %v2
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: %shuf:_(<2 x s64>) = G_DUPLANE64 %v1, [[C]](s64)
; CHECK-NEXT: $q0 = COPY %shuf(<2 x s64>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%v1:_(<2 x s64>) = COPY $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-rev.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-rev.mir
index c5a6030..1d24f8ac 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-rev.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-rev.mir
@@ -38,8 +38,11 @@ body: |
; CHECK: liveins: $d0, $d1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
- ; CHECK-NEXT: [[REV64_:%[0-9]+]]:_(<2 x s32>) = G_REV64 [[COPY]]
- ; CHECK-NEXT: $d0 = COPY [[REV64_]](<2 x s32>)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[DEF]](<2 x s32>)
+ ; CHECK-NEXT: [[DUPLANE32_:%[0-9]+]]:_(<2 x s32>) = G_DUPLANE32 [[CONCAT_VECTORS]], [[C]](s64)
+ ; CHECK-NEXT: $d0 = COPY [[DUPLANE32_]](<2 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
%0:_(<2 x s32>) = COPY $d0
%1:_(<2 x s32>) = COPY $d1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
index 35bc36d..4e569e0b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
@@ -94,7 +94,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(<4 x s16>) = COPY $d0
; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[C]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr(<4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:fpr(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<8 x s16>), [[C]](s64)
; CHECK-NEXT: $h0 = COPY [[EVEC]](s16)
; CHECK-NEXT: RET_ReallyLR implicit $h0
%0:_(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 7f922c0..287344b 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -70,6 +70,9 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
;
; CHECK-GI-LABEL: test_bitf_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index b8eb826..73fcee5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -70,6 +70,9 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
;
; CHECK-GI-LABEL: test_bit_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
index 8de1fc5..bf73aeb 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
@@ -1291,21 +1291,13 @@ entry:
}
define i64 @umull_ldrb_h(ptr %x0, i16 %x1) {
-; CHECK-SD-LABEL: umull_ldrb_h:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umull x0, w8, w9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldrb_h:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldrb_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umull x0, w8, w9
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1315,21 +1307,13 @@ entry:
}
define i64 @umull_ldrb_h_commuted(ptr %x0, i16 %x1) {
-; CHECK-SD-LABEL: umull_ldrb_h_commuted:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umull x0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldrb_h_commuted:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: mul x0, x9, x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldrb_h_commuted:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umull x0, w9, w8
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1339,18 +1323,11 @@ entry:
}
define i64 @umull_ldrh_w(ptr %x0, i32 %x1) {
-; CHECK-SD-LABEL: umull_ldrh_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: umull x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldrh_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrh w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldrh_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: umull x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i16, ptr %x0
%zext = zext i16 %ext64 to i64
@@ -1360,21 +1337,13 @@ entry:
}
define i64 @umull_ldr_b(ptr %x0, i8 %x1) {
-; CHECK-SD-LABEL: umull_ldr_b:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xff
-; CHECK-SD-NEXT: umull x0, w8, w9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldr_b:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xff
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldr_b:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xff
+; CHECK-NEXT: umull x0, w8, w9
+; CHECK-NEXT: ret
entry:
%ext64 = load i32, ptr %x0
%zext = zext i32 %ext64 to i64
@@ -1384,18 +1353,11 @@ entry:
}
define i64 @umull_ldr2_w(ptr %x0, i32 %x1) {
-; CHECK-SD-LABEL: umull_ldr2_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umull x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldr2_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldr2_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umull x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1405,19 +1367,12 @@ entry:
}
define i64 @umull_ldr2_ldr2(ptr %x0, ptr %x1) {
-; CHECK-SD-LABEL: umull_ldr2_ldr2:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: ldr w9, [x1]
-; CHECK-SD-NEXT: umull x0, w8, w9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldr2_ldr2:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: ldr w9, [x1]
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldr2_ldr2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: umull x0, w8, w9
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1428,18 +1383,11 @@ entry:
}
define i64 @umull_ldr2_d(ptr %x0, i64 %x1) {
-; CHECK-SD-LABEL: umull_ldr2_d:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umull x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldr2_d:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldr2_d:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umull x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1449,21 +1397,13 @@ entry:
}
define i64 @umaddl_ldrb_h(ptr %x0, i16 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldrb_h:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umaddl x0, w8, w9, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldrb_h:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: madd x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldrb_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umaddl x0, w8, w9, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1474,21 +1414,13 @@ entry:
}
define i64 @umaddl_ldrb_h_commuted(ptr %x0, i16 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldrb_h_commuted:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umaddl x0, w9, w8, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldrb_h_commuted:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: madd x0, x9, x8, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldrb_h_commuted:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umaddl x0, w9, w8, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1499,18 +1431,11 @@ entry:
}
define i64 @umaddl_ldrh_w(ptr %x0, i32 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldrh_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: umaddl x0, w8, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldrh_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrh w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: madd x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldrh_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: umaddl x0, w8, w1, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i16, ptr %x0
%zext = zext i16 %ext64 to i64
@@ -1521,21 +1446,13 @@ entry:
}
define i64 @umaddl_ldr_b(ptr %x0, i8 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldr_b:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xff
-; CHECK-SD-NEXT: umaddl x0, w8, w9, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldr_b:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xff
-; CHECK-GI-NEXT: madd x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldr_b:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xff
+; CHECK-NEXT: umaddl x0, w8, w9, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i32, ptr %x0
%zext = zext i32 %ext64 to i64
@@ -1546,18 +1463,11 @@ entry:
}
define i64 @umaddl_ldr2_w(ptr %x0, i32 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldr2_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umaddl x0, w8, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldr2_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: madd x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldr2_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umaddl x0, w8, w1, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1568,19 +1478,12 @@ entry:
}
define i64 @umaddl_ldr2_ldr2(ptr %x0, ptr %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldr2_ldr2:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: ldr w9, [x1]
-; CHECK-SD-NEXT: umaddl x0, w8, w9, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldr2_ldr2:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: ldr w9, [x1]
-; CHECK-GI-NEXT: madd x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldr2_ldr2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: umaddl x0, w8, w9, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1592,18 +1495,11 @@ entry:
}
define i64 @umaddl_ldr2_d(ptr %x0, i64 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umaddl_ldr2_d:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umaddl x0, w8, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_ldr2_d:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: madd x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_ldr2_d:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umaddl x0, w8, w1, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1614,21 +1510,13 @@ entry:
}
define i64 @umnegl_ldrb_h(ptr %x0, i16 %x1) {
-; CHECK-SD-LABEL: umnegl_ldrb_h:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umnegl x0, w8, w9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldrb_h:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: mneg x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldrb_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umnegl x0, w8, w9
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1639,21 +1527,13 @@ entry:
}
define i64 @umnegl_ldrb_h_commuted(ptr %x0, i16 %x1) {
-; CHECK-SD-LABEL: umnegl_ldrb_h_commuted:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umnegl x0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldrb_h_commuted:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: mneg x0, x9, x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldrb_h_commuted:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umnegl x0, w9, w8
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1664,18 +1544,11 @@ entry:
}
define i64 @umnegl_ldrh_w(ptr %x0, i32 %x1) {
-; CHECK-SD-LABEL: umnegl_ldrh_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: umnegl x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldrh_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrh w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: mneg x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldrh_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: umnegl x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i16, ptr %x0
%zext = zext i16 %ext64 to i64
@@ -1686,21 +1559,13 @@ entry:
}
define i64 @umnegl_ldr_b(ptr %x0, i8 %x1) {
-; CHECK-SD-LABEL: umnegl_ldr_b:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xff
-; CHECK-SD-NEXT: umnegl x0, w8, w9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldr_b:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xff
-; CHECK-GI-NEXT: mneg x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldr_b:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xff
+; CHECK-NEXT: umnegl x0, w8, w9
+; CHECK-NEXT: ret
entry:
%ext64 = load i32, ptr %x0
%zext = zext i32 %ext64 to i64
@@ -1711,18 +1576,11 @@ entry:
}
define i64 @umnegl_ldr2_w(ptr %x0, i32 %x1) {
-; CHECK-SD-LABEL: umnegl_ldr2_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umnegl x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldr2_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: mneg x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldr2_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umnegl x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1733,19 +1591,12 @@ entry:
}
define i64 @umnegl_ldr2_ldr2(ptr %x0, ptr %x1) {
-; CHECK-SD-LABEL: umnegl_ldr2_ldr2:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: ldr w9, [x1]
-; CHECK-SD-NEXT: umnegl x0, w8, w9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldr2_ldr2:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: ldr w9, [x1]
-; CHECK-GI-NEXT: mneg x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldr2_ldr2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: umnegl x0, w8, w9
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1757,18 +1608,11 @@ entry:
}
define i64 @umnegl_ldr2_d(ptr %x0, i64 %x1) {
-; CHECK-SD-LABEL: umnegl_ldr2_d:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umnegl x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umnegl_ldr2_d:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: mneg x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umnegl_ldr2_d:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umnegl x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1779,21 +1623,13 @@ entry:
}
define i64 @umsubl_ldrb_h(ptr %x0, i16 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldrb_h:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umsubl x0, w8, w9, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldrb_h:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: msub x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldrb_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umsubl x0, w8, w9, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1804,21 +1640,13 @@ entry:
}
define i64 @umsubl_ldrb_h_commuted(ptr %x0, i16 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldrb_h_commuted:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xffff
-; CHECK-SD-NEXT: umsubl x0, w9, w8, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldrb_h_commuted:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xffff
-; CHECK-GI-NEXT: msub x0, x9, x8, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldrb_h_commuted:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xffff
+; CHECK-NEXT: umsubl x0, w9, w8, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i8, ptr %x0
%zext = zext i8 %ext64 to i64
@@ -1829,18 +1657,11 @@ entry:
}
define i64 @umsubl_ldrh_w(ptr %x0, i32 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldrh_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: umsubl x0, w8, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldrh_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrh w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: msub x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldrh_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: umsubl x0, w8, w1, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i16, ptr %x0
%zext = zext i16 %ext64 to i64
@@ -1851,21 +1672,13 @@ entry:
}
define i64 @umsubl_ldr_b(ptr %x0, i8 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldr_b:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-NEXT: and x9, x1, #0xff
-; CHECK-SD-NEXT: umsubl x0, w8, w9, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldr_b:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT: and x9, x1, #0xff
-; CHECK-GI-NEXT: msub x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldr_b:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x9, x1, #0xff
+; CHECK-NEXT: umsubl x0, w8, w9, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i32, ptr %x0
%zext = zext i32 %ext64 to i64
@@ -1876,18 +1689,11 @@ entry:
}
define i64 @umsubl_ldr2_w(ptr %x0, i32 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldr2_w:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umsubl x0, w8, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldr2_w:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: msub x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldr2_w:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umsubl x0, w8, w1, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1898,19 +1704,12 @@ entry:
}
define i64 @umsubl_ldr2_ldr2(ptr %x0, ptr %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldr2_ldr2:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: ldr w9, [x1]
-; CHECK-SD-NEXT: umsubl x0, w8, w9, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldr2_ldr2:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: ldr w9, [x1]
-; CHECK-GI-NEXT: msub x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldr2_ldr2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: umsubl x0, w8, w9, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1922,18 +1721,11 @@ entry:
}
define i64 @umsubl_ldr2_d(ptr %x0, i64 %x1, i64 %x2) {
-; CHECK-SD-LABEL: umsubl_ldr2_d:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: umsubl x0, w8, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umsubl_ldr2_d:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr w8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: msub x0, x8, x9, x2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umsubl_ldr2_d:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: umsubl x0, w8, w1, x2
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 4294967295
@@ -1944,20 +1736,12 @@ entry:
}
define i64 @umull_ldr2_w_cc1(ptr %x0, i32 %x1) {
-; CHECK-SD-LABEL: umull_ldr2_w_cc1:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr x8, [x0]
-; CHECK-SD-NEXT: and x8, x8, #0x7fffffff
-; CHECK-SD-NEXT: umull x0, w8, w1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_ldr2_w_cc1:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr x8, [x0]
-; CHECK-GI-NEXT: mov w9, w1
-; CHECK-GI-NEXT: and x8, x8, #0x7fffffff
-; CHECK-GI-NEXT: mul x0, x8, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_ldr2_w_cc1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: and x8, x8, #0x7fffffff
+; CHECK-NEXT: umull x0, w8, w1
+; CHECK-NEXT: ret
entry:
%ext64 = load i64, ptr %x0
%and = and i64 %ext64, 2147483647
@@ -1998,18 +1782,11 @@ entry:
}
define i64 @umull_and_lshr(i64 %x) {
-; CHECK-SD-LABEL: umull_and_lshr:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: lsr x8, x0, #32
-; CHECK-SD-NEXT: umull x0, w0, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umull_and_lshr:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: lsr x8, x0, #32
-; CHECK-GI-NEXT: mov w9, w0
-; CHECK-GI-NEXT: mul x0, x9, x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umull_and_lshr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr x8, x0, #32
+; CHECK-NEXT: umull x0, w0, w8
+; CHECK-NEXT: ret
%lo = and i64 %x, u0xffffffff
%hi = lshr i64 %x, 32
%mul = mul i64 %lo, %hi
@@ -2028,18 +1805,11 @@ define i64 @umull_and_and(i64 %x, i64 %y) {
}
define i64 @umaddl_and_lshr(i64 %x, i64 %a) {
-; CHECK-SD-LABEL: umaddl_and_lshr:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: lsr x8, x0, #32
-; CHECK-SD-NEXT: umaddl x0, w0, w8, x1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_and_lshr:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: lsr x8, x0, #32
-; CHECK-GI-NEXT: mov w9, w0
-; CHECK-GI-NEXT: madd x0, x9, x8, x1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_and_lshr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr x8, x0, #32
+; CHECK-NEXT: umaddl x0, w0, w8, x1
+; CHECK-NEXT: ret
%lo = and i64 %x, u0xffffffff
%hi = lshr i64 %x, 32
%mul = mul i64 %lo, %hi
@@ -2048,16 +1818,10 @@ define i64 @umaddl_and_lshr(i64 %x, i64 %a) {
}
define i64 @umaddl_and_and(i64 %x, i64 %y, i64 %a) {
-; CHECK-SD-LABEL: umaddl_and_and:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: umaddl x0, w0, w1, x2
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaddl_and_and:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: umull x8, w0, w1
-; CHECK-GI-NEXT: add x0, x2, x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaddl_and_and:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umaddl x0, w0, w1, x2
+; CHECK-NEXT: ret
%lo = and i64 %x, u0xffffffff
%hi = and i64 %y, u0xffffffff
%mul = mul i64 %lo, %hi
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 0f56d25..470d68a 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -243,6 +243,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
;
; CHECK-GI-LABEL: abs_v1i32:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: cmp w8, #0
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index c279cf0..49fb6c9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -401,16 +401,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
; the formation of an indexed-by-7 MLS.
define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
-; CHECK-SD-LABEL: test_high_splat:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_high_splat:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup.8h v2, v2[7]
-; CHECK-GI-NEXT: mls.4h v0, v2, v1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_high_splat:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls.4h v0, v1, v2[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index 0f08556..69b3010 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -174,7 +174,7 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
; CHECK-GISEL-NEXT: and x9, x9, #0x3
; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GISEL-NEXT: str d0, [sp, #8]
-; CHECK-GISEL-NEXT: madd x8, x9, x8, x10
+; CHECK-GISEL-NEXT: umaddl x8, w9, w8, x10
; CHECK-GISEL-NEXT: umov w9, v0.h[1]
; CHECK-GISEL-NEXT: ld1 { v0.h }[0], [x8]
; CHECK-GISEL-NEXT: mov v0.s[1], w9
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index c3ad3b4..85d8b7c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -159,16 +159,10 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mla v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmla_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -189,16 +183,10 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmla_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -271,16 +259,10 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmls_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -301,16 +283,10 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmls_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -427,16 +403,10 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -455,16 +425,10 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -483,16 +447,10 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -511,16 +469,10 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -567,16 +519,10 @@ entry:
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vfma_laneq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -834,16 +780,10 @@ entry:
}
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -852,16 +792,10 @@ entry:
}
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -920,8 +854,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlal_high_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -940,8 +873,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlal_high_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -978,16 +910,10 @@ entry:
}
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -996,16 +922,10 @@ entry:
}
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1064,8 +984,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1084,8 +1003,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1122,16 +1040,10 @@ entry:
}
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1140,16 +1052,10 @@ entry:
}
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1208,8 +1114,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlal_high_laneq_u16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1228,8 +1133,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlal_high_laneq_u32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1266,16 +1170,10 @@ entry:
}
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1284,16 +1182,10 @@ entry:
}
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1352,8 +1244,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1372,8 +1263,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1512,16 +1402,10 @@ entry:
}
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1529,16 +1413,10 @@ entry:
}
define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smull v0.2d, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1546,16 +1424,10 @@ entry:
}
define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1563,16 +1435,10 @@ entry:
}
define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umull v0.2d, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1588,8 +1454,7 @@ define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1607,8 +1472,7 @@ define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1626,8 +1490,7 @@ define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_u16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1645,8 +1508,7 @@ define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_u32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1816,16 +1678,10 @@ entry:
}
define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.h[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[3]
-; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqdmull_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1833,16 +1689,10 @@ entry:
}
define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqdmull v0.2d, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqdmull_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1898,8 +1748,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1917,8 +1766,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -2322,16 +2170,10 @@ entry:
}
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_f32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_f32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: fmul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%mul = fmul <2 x float> %shuffle, %a
@@ -2553,16 +2395,10 @@ entry:
}
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmulx_laneq_f32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmulx v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmulx_laneq_f32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: fmulx v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmulx_laneq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2657,16 +2493,10 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mla v0.4h, v1.4h, v2.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmla_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -2687,16 +2517,10 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mla v0.2s, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmla_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -2769,16 +2593,10 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmls_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -2799,16 +2617,10 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmls_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -2925,16 +2737,10 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -2953,16 +2759,10 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -2981,16 +2781,10 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_u16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -3009,16 +2803,10 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_u32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -3061,16 +2849,10 @@ entry:
}
define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmla v0.2s, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vfma_laneq_f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -3188,16 +2970,10 @@ entry:
}
define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlal v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3206,16 +2982,10 @@ entry:
}
define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlal v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3274,8 +3044,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
; CHECK-GI-LABEL: test_vmlal_high_laneq_s16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3294,8 +3063,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
; CHECK-GI-LABEL: test_vmlal_high_laneq_s32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3332,16 +3100,10 @@ entry:
}
define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlsl v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3350,16 +3112,10 @@ entry:
}
define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlsl v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3418,8 +3174,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3438,8 +3193,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3476,16 +3230,10 @@ entry:
}
define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlal v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_u16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3494,16 +3242,10 @@ entry:
}
define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlal v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_u32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3562,8 +3304,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
; CHECK-GI-LABEL: test_vmlal_high_laneq_u16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3582,8 +3323,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
; CHECK-GI-LABEL: test_vmlal_high_laneq_u32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3620,16 +3360,10 @@ entry:
}
define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlsl v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_u16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3638,16 +3372,10 @@ entry:
}
define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlsl v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_u32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3706,8 +3434,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3726,8 +3453,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3866,16 +3592,10 @@ entry:
}
define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -3883,16 +3603,10 @@ entry:
}
define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smull v0.2d, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -3900,16 +3614,10 @@ entry:
}
define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_u16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -3917,16 +3625,10 @@ entry:
}
define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umull v0.2d, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmull_laneq_u32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -3942,8 +3644,7 @@ define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_s16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3961,8 +3662,7 @@ define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_s32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3980,8 +3680,7 @@ define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_u16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3999,8 +3698,7 @@ define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-GI-LABEL: test_vmull_high_laneq_u32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -4170,16 +3868,10 @@ entry:
}
define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqdmull_laneq_s16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -4187,16 +3879,10 @@ entry:
}
define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqdmull_laneq_s32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -4252,8 +3938,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -4271,8 +3956,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d0, v0.d[1]
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -4402,16 +4086,10 @@ entry:
}
define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_f32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_f32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: fmul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -4498,16 +4176,10 @@ entry:
}
define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmulx_laneq_f32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmulx v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmulx_laneq_f32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT: fmulx v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmulx_laneq_f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
index 17fb312..0ede4bc7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -139,7 +139,7 @@ define i32 @addp_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: addp v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: rev64 v1.2s, v0.2s
+; CHECK-GI-NEXT: dup v1.2s, v0.s[1]
; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index ddd8a72..60af49d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -807,46 +807,28 @@ define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
}
define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
-; CHECK-SD-LABEL: test_vdup_laneq_s8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: dup v0.8b, v0.b[5]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vdup_laneq_s8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.16b, v0.b[5]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vdup_laneq_s8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8b, v0.b[5]
+; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i8> %shuffle
}
define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
-; CHECK-SD-LABEL: test_vdup_laneq_s16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: dup v0.4h, v0.h[2]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vdup_laneq_s16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vdup_laneq_s16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.4h, v0.h[2]
+; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i16> %shuffle
}
define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
-; CHECK-SD-LABEL: test_vdup_laneq_s32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: dup v0.2s, v0.s[1]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vdup_laneq_s32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.4s, v0.s[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vdup_laneq_s32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.2s, v0.s[1]
+; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i32> %shuffle
}
@@ -1233,6 +1215,7 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: testDUP.v1i8:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: ret
@@ -1728,7 +1711,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI127_0
-; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b1, v0.b[0]
; CHECK-GI-NEXT: mov v1.b[1], v0.b[1]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[2]
; CHECK-GI-NEXT: mov v1.b[3], v0.b[3]
@@ -1835,7 +1818,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b2, v0.b[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.b[1], v0.b[1]
; CHECK-GI-NEXT: mov v2.b[2], v0.b[2]
@@ -1921,7 +1904,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI131_0
-; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov h1, v0.h[0]
; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
@@ -1992,7 +1975,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov h2, v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
@@ -2054,7 +2037,7 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI135_0
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0]
; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
@@ -2260,6 +2243,7 @@ define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: concat_vector_v8i8:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: ret
@@ -2286,6 +2270,7 @@ define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: concat_vector_v16i8:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.16b, w8
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index f47c06e..ac6f041 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -614,16 +614,11 @@ entry:
}
define void @test_vst1_lane0_s16(ptr %a, <4 x i16> %b) {
-; CHECK-GI-LABEL: test_vst1_lane0_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: str h0, [x0]
-; CHECK-GI-NEXT: ret
-;
-; CHECK-SD-LABEL: test_vst1_lane0_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str h0, [x0]
-; CHECK-SD-NEXT: ret
+; CHECK-LABEL: test_vst1_lane0_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <4 x i16> %b, i32 0
store i16 %0, ptr %a, align 2
@@ -643,16 +638,11 @@ entry:
}
define void @test_vst1_lane0_s32(ptr %a, <2 x i32> %b) {
-; CHECK-GI-LABEL: test_vst1_lane0_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: str s0, [x0]
-; CHECK-GI-NEXT: ret
-;
-; CHECK-SD-LABEL: test_vst1_lane0_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str s0, [x0]
-; CHECK-SD-NEXT: ret
+; CHECK-LABEL: test_vst1_lane0_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <2 x i32> %b, i32 0
store i32 %0, ptr %a, align 4
@@ -683,16 +673,11 @@ entry:
}
define void @test_vst1_lane0_f32(ptr %a, <2 x float> %b) {
-; CHECK-GI-LABEL: test_vst1_lane0_f32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: str s0, [x0]
-; CHECK-GI-NEXT: ret
-;
-; CHECK-SD-LABEL: test_vst1_lane0_f32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str s0, [x0]
-; CHECK-SD-NEXT: ret
+; CHECK-LABEL: test_vst1_lane0_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <2 x float> %b, i32 0
store float %0, ptr %a, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index 7b439dd..1f8ac79 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -569,16 +569,10 @@ define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; Using sqrdmlah intrinsics
define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqrdmlah_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlah_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: sqrdmlah v0.4h, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlah_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
@@ -586,16 +580,10 @@ entry:
}
define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqrdmlah_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlah_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: sqrdmlah v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlah_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
%vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
@@ -657,23 +645,14 @@ entry:
}
define i16 @test_vqrdmlahh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
-; CHECK-SD-LABEL: test_vqrdmlahh_lane_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: fmov s2, w1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[3]
-; CHECK-SD-NEXT: umov w0, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlahh_lane_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: rev64 v0.4h, v0.4h
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: sqrdmlah v1.4h, v2.4h, v0.4h
-; CHECK-GI-NEXT: umov w0, v1.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlahh_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[3]
+; CHECK-NEXT: umov w0, v1.h[0]
+; CHECK-NEXT: ret
entry:
%0 = insertelement <4 x i16> undef, i16 %a, i64 0
%1 = insertelement <4 x i16> undef, i16 %b, i64 0
@@ -684,24 +663,14 @@ entry:
}
define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-SD-LABEL: test_vqrdmlahs_lane_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: fmov s2, w1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: sqrdmlah s1, s2, v0.s[1]
-; CHECK-SD-NEXT: fmov w0, s1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlahs_lane_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: sqrdmlah s1, s2, s0
-; CHECK-GI-NEXT: fmov w0, s1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlah s1, s2, v0.s[1]
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <2 x i32> %c, i64 1
%vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -709,22 +678,13 @@ entry:
}
define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
-; CHECK-SD-LABEL: test_vqrdmlahh_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: fmov s2, w1
-; CHECK-SD-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[7]
-; CHECK-SD-NEXT: umov w0, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlahh_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #14
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: sqrdmlah v1.4h, v2.4h, v0.4h
-; CHECK-GI-NEXT: umov w0, v1.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[7]
+; CHECK-NEXT: umov w0, v1.h[0]
+; CHECK-NEXT: ret
entry:
%0 = insertelement <4 x i16> undef, i16 %a, i64 0
%1 = insertelement <4 x i16> undef, i16 %b, i64 0
@@ -749,16 +709,10 @@ entry:
}
define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: sqrdmlsh v0.4h, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlsh_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
@@ -766,16 +720,10 @@ entry:
}
define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: sqrdmlsh v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlsh_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
%vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
@@ -837,23 +785,14 @@ entry:
}
define i16 @test_vqrdmlshh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
-; CHECK-SD-LABEL: test_vqrdmlshh_lane_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: fmov s2, w1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[3]
-; CHECK-SD-NEXT: umov w0, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlshh_lane_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: rev64 v0.4h, v0.4h
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: sqrdmlsh v1.4h, v2.4h, v0.4h
-; CHECK-GI-NEXT: umov w0, v1.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlshh_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[3]
+; CHECK-NEXT: umov w0, v1.h[0]
+; CHECK-NEXT: ret
entry:
%0 = insertelement <4 x i16> undef, i16 %a, i64 0
%1 = insertelement <4 x i16> undef, i16 %b, i64 0
@@ -864,24 +803,14 @@ entry:
}
define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-SD-LABEL: test_vqrdmlshs_lane_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: fmov s2, w1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: sqrdmlsh s1, s2, v0.s[1]
-; CHECK-SD-NEXT: fmov w0, s1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlshs_lane_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: sqrdmlsh s1, s2, s0
-; CHECK-GI-NEXT: fmov w0, s1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlsh s1, s2, v0.s[1]
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <2 x i32> %c, i64 1
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -889,22 +818,13 @@ entry:
}
define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
-; CHECK-SD-LABEL: test_vqrdmlshh_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmov s1, w0
-; CHECK-SD-NEXT: fmov s2, w1
-; CHECK-SD-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[7]
-; CHECK-SD-NEXT: umov w0, v1.h[0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vqrdmlshh_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #14
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: fmov s2, w1
-; CHECK-GI-NEXT: sqrdmlsh v1.4h, v2.4h, v0.4h
-; CHECK-GI-NEXT: umov w0, v1.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[7]
+; CHECK-NEXT: umov w0, v1.h[0]
+; CHECK-NEXT: ret
entry:
%0 = insertelement <4 x i16> undef, i16 %a, i64 0
%1 = insertelement <4 x i16> undef, i16 %b, i64 0
@@ -927,3 +847,6 @@ entry:
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
ret i32 %vqrdmlshs_s32.i
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index 6bdd5f9..84557b4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -34,8 +34,7 @@ define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-GI-LABEL: test_rev_w_srl16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0xffff
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w0, w8, #16
+; CHECK-GI-NEXT: rev16 w0, w8
; CHECK-GI-NEXT: ret
entry:
%0 = zext i16 %a to i32
@@ -45,12 +44,18 @@ entry:
}
define i32 @test_rev_w_srl16_load(ptr %a) {
-; CHECK-LABEL: test_rev_w_srl16_load:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: lsr w0, w8, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_rev_w_srl16_load:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: rev w8, w8
+; CHECK-SD-NEXT: lsr w0, w8, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_rev_w_srl16_load:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldrh w8, [x0]
+; CHECK-GI-NEXT: rev16 w0, w8
+; CHECK-GI-NEXT: ret
entry:
%0 = load i16, ptr %a
%1 = zext i16 %0 to i32
@@ -71,8 +76,7 @@ define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w1, #0xff
; CHECK-GI-NEXT: add w8, w8, w0, uxtb
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w0, w8, #16
+; CHECK-GI-NEXT: rev16 w0, w8
; CHECK-GI-NEXT: ret
entry:
%0 = zext i8 %a to i32
@@ -96,8 +100,7 @@ define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-GI-LABEL: test_rev_x_srl32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, w0
-; CHECK-GI-NEXT: rev x8, x8
-; CHECK-GI-NEXT: lsr x0, x8, #32
+; CHECK-GI-NEXT: rev32 x0, x8
; CHECK-GI-NEXT: ret
entry:
%0 = zext i32 %a to i64
@@ -107,12 +110,18 @@ entry:
}
define i64 @test_rev_x_srl32_load(ptr %a) {
-; CHECK-LABEL: test_rev_x_srl32_load:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: rev x8, x8
-; CHECK-NEXT: lsr x0, x8, #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_rev_x_srl32_load:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr w8, [x0]
+; CHECK-SD-NEXT: rev x8, x8
+; CHECK-SD-NEXT: lsr x0, x8, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_rev_x_srl32_load:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: rev32 x0, x8
+; CHECK-GI-NEXT: ret
entry:
%0 = load i32, ptr %a
%1 = zext i32 %0 to i64
@@ -122,18 +131,11 @@ entry:
}
define i64 @test_rev_x_srl32_shift(i64 %a) {
-; CHECK-SD-LABEL: test_rev_x_srl32_shift:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ubfx x8, x0, #2, #29
-; CHECK-SD-NEXT: rev32 x0, x8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_rev_x_srl32_shift:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ubfx x8, x0, #2, #29
-; CHECK-GI-NEXT: rev x8, x8
-; CHECK-GI-NEXT: lsr x0, x8, #32
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_rev_x_srl32_shift:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ubfx x8, x0, #2, #29
+; CHECK-NEXT: rev32 x0, x8
+; CHECK-NEXT: ret
entry:
%0 = shl i64 %a, 33
%1 = lshr i64 %0, 35
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d4cc154..eccf918 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -271,6 +271,7 @@ define half @test_vcvt_f16_f32(<1 x float> %x) {
;
; GISEL-LABEL: test_vcvt_f16_f32:
; GISEL: // %bb.0:
+; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
; GISEL-NEXT: fcvt h0, s0
; GISEL-NEXT: ret
%tmp = fptrunc <1 x float> %x to <1 x half>
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 38cec0d..d2f72ec 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -15,7 +15,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
; CHECK-GI-NEXT: mov w8, #58712 // =0xe558
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
+; CHECK-GI-NEXT: dup v0.4h, v0.h[1]
; CHECK-GI-NEXT: ret
%1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
; Can't optimize the following bitcast to scalar_to_vector.
@@ -35,7 +35,7 @@ define <4 x i16> @foo2(<2 x i32> %a) {
; CHECK-GI-NEXT: mov w8, #712 // =0x2c8
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
+; CHECK-GI-NEXT: dup v0.4h, v0.h[1]
; CHECK-GI-NEXT: ret
%1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
; Can't optimize the following bitcast to scalar_to_vector.
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 898958f..9ae4782 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -207,6 +207,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
;
; CHECK-GI-LABEL: bswap_v1i32:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: rev w8, w8
; CHECK-GI-NEXT: fmov s0, w8
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index acf15f1..1e8dd0c 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -13,11 +13,10 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.h[2], w8
-; CHECK-GI-NEXT: mov v0.h[3], w9
+; CHECK-GI-NEXT: mov w8, v1.s[1]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[3], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index f30895d..2d146bf 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -65,10 +65,8 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s2
-; CHECK-GI-NEXT: fmul s1, s1, s3
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -92,10 +90,8 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s2
-; CHECK-GI-NEXT: fmul s1, s1, s3
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -922,10 +918,8 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
; CHECK-GI-NEXT: mov d5, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s
-; CHECK-GI-NEXT: mov s4, v0.s[1]
-; CHECK-GI-NEXT: mov s5, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s4
-; CHECK-GI-NEXT: fmul s1, s1, s5
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 26e070f..079ff10 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -1025,13 +1025,12 @@ define <3 x i64> @duplane0_v3i64(<3 x i64> %b) {
;
; CHECK-GI-LABEL: duplane0_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d2, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v2.d[1], v1.d[0]
-; CHECK-GI-NEXT: dup v0.2d, v2.d[0]
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: dup v0.2d, v0.d[0]
; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov d2, d0
; CHECK-GI-NEXT: ret
entry:
%c = shufflevector <3 x i64> %b, <3 x i64> poison, <3 x i32> zeroinitializer
@@ -2354,13 +2353,12 @@ define <3 x double> @duplane0_v3double(<3 x double> %b) {
;
; CHECK-GI-LABEL: duplane0_v3double:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov d2, d0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov v2.d[1], v1.d[0]
-; CHECK-GI-NEXT: dup v0.2d, v2.d[0]
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: dup v0.2d, v0.d[0]
; CHECK-GI-NEXT: mov d1, v0.d[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: fmov d2, d0
; CHECK-GI-NEXT: ret
entry:
%c = shufflevector <3 x double> %b, <3 x double> poison, <3 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index adc536d..aa120f2 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1496,7 +1496,7 @@ define half @test_copysign(half %a, half %b) #0 {
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: test_copysign:
@@ -1505,7 +1505,7 @@ define half @test_copysign(half %a, half %b) #0 {
; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-FP16-GI-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-FP16-GI-NEXT: ret
%r = call half @llvm.copysign.f16(half %a, half %b)
ret half %r
@@ -1536,7 +1536,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: test_copysign_f32:
@@ -1545,7 +1545,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
; CHECK-FP16-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-FP16-GI-NEXT: ret
%tb = fptrunc float %b to half
%r = call half @llvm.copysign.f16(half %a, half %tb)
@@ -1577,7 +1577,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: test_copysign_f64:
@@ -1586,7 +1586,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
; CHECK-FP16-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-FP16-GI-NEXT: ret
%tb = fptrunc double %b to half
%r = call half @llvm.copysign.f16(half %a, half %tb)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index b155791..943073e 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -196,7 +196,7 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
@@ -537,7 +537,7 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 3a5f7e2..7ac1f37 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -33,7 +33,7 @@ define float @copysign_f32(float %a, float %b) {
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $d0
; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $d1
; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = call float @llvm.copysign.f32(float %a, float %b)
@@ -56,7 +56,7 @@ define half @copysign_f16(half %a, half %b) {
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-GI-NEXT: ret
entry:
%c = call half @llvm.copysign.f16(half %a, half %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index b408e9c..2c512de 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -169,7 +169,7 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s
@@ -468,7 +468,7 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s
@@ -767,7 +767,7 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s
@@ -1066,7 +1066,7 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s
@@ -1365,7 +1365,7 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s
@@ -1664,7 +1664,7 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s
@@ -1963,7 +1963,7 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index 5bdcccc..d232ca4 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -199,7 +199,7 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index fb12f8a..1c7c55d 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 64f0da8..da9b572 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index a37aabb..ef59209 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -268,7 +268,7 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v5.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
@@ -873,7 +873,7 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v3.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
@@ -1358,7 +1358,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v3.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index bd3d1353..51eba56 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -196,7 +196,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 9c21d2b..bcebbf4 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,6 +31,7 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fcvtzs w8, s0
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 44847a4..38895eb 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,6 +31,7 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fcvtzu w8, s0
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 1f84c94..a428c95 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -263,7 +263,7 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
; CHECK-GI-NEXT: fcvt s2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov s0, v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
@@ -354,7 +354,7 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 6c5fd8e..1e888a4 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -203,7 +203,7 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 4c34c2f..1af36cc 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -630,7 +630,7 @@ define <8 x i8> @insert_v8i8_c(<8 x i8> %a, i8 %b, i32 %c) {
; CHECK-GI-NEXT: mov w8, #1 // =0x1
; CHECK-GI-NEXT: str d0, [sp, #8]
; CHECK-GI-NEXT: and x9, x9, #0x7
-; CHECK-GI-NEXT: mul x8, x9, x8
+; CHECK-GI-NEXT: umull x8, w9, w8
; CHECK-GI-NEXT: add x9, sp, #8
; CHECK-GI-NEXT: strb w0, [x9, x8]
; CHECK-GI-NEXT: ldr d0, [sp, #8]
@@ -682,7 +682,7 @@ define <16 x i8> @insert_v16i8_c(<16 x i8> %a, i8 %b, i32 %c) {
; CHECK-GI-NEXT: mov w8, #1 // =0x1
; CHECK-GI-NEXT: str q0, [sp]
; CHECK-GI-NEXT: and x9, x9, #0xf
-; CHECK-GI-NEXT: mul x8, x9, x8
+; CHECK-GI-NEXT: umull x8, w9, w8
; CHECK-GI-NEXT: mov x9, sp
; CHECK-GI-NEXT: strb w0, [x9, x8]
; CHECK-GI-NEXT: ldr q0, [sp], #16
@@ -1478,16 +1478,11 @@ entry:
}
define float @extract_v2f32_0(<2 x float> %a, i32 %c) {
-; CHECK-SD-LABEL: extract_v2f32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: extract_v2f32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: extract_v2f32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
entry:
%d = extractelement <2 x float> %a, i32 0
ret float %d
@@ -1686,16 +1681,11 @@ entry:
}
define half @extract_v4f16_0(<4 x half> %a, i32 %c) {
-; CHECK-SD-LABEL: extract_v4f16_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: extract_v4f16_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: extract_v4f16_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-NEXT: ret
entry:
%d = extractelement <4 x half> %a, i32 0
ret half %d
@@ -2159,16 +2149,11 @@ entry:
}
define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) {
-; CHECK-SD-LABEL: extract_v2i32_0:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: extract_v2i32_0:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: extract_v2i32_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%d = extractelement <2 x i32> %a, i32 0
ret i32 %d
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index e8194b9..5ec30b6 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4378,7 +4378,7 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov s0, v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
@@ -4415,7 +4415,7 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT: mov s0, v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
@@ -6393,7 +6393,7 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -6439,7 +6439,7 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -7375,7 +7375,7 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-LABEL: stofp_v2i32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
@@ -7395,7 +7395,7 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-LABEL: utofp_v2i32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
@@ -7602,7 +7602,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #16
; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #16
; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -7637,7 +7637,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -8124,7 +8124,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #24
; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -8175,7 +8175,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: movi d1, #0x0000ff000000ff
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index c1ea891..9d16555 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -301,28 +301,17 @@ define float @exp10_f32(float %x) {
}
define <1 x float> @exp10_v1f32(<1 x float> %x) {
-; SDAG-LABEL: exp10_v1f32:
-; SDAG: // %bb.0:
-; SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; SDAG-NEXT: .cfi_def_cfa_offset 16
-; SDAG-NEXT: .cfi_offset w30, -16
-; SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
-; SDAG-NEXT: // kill: def $s0 killed $s0 killed $q0
-; SDAG-NEXT: bl exp10f
-; SDAG-NEXT: // kill: def $s0 killed $s0 def $d0
-; SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: exp10_v1f32:
-; GISEL: // %bb.0:
-; GISEL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISEL-NEXT: .cfi_def_cfa_offset 16
-; GISEL-NEXT: .cfi_offset w30, -16
-; GISEL-NEXT: // kill: def $s0 killed $s0 killed $d0
-; GISEL-NEXT: bl exp10f
-; GISEL-NEXT: // kill: def $s0 killed $s0 def $d0
-; GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISEL-NEXT: ret
+; CHECK-LABEL: exp10_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: bl exp10f
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $d0
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
%r = call <1 x float> @llvm.exp10.v1f32(<1 x float> %x)
ret <1 x float> %r
}
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index c158d8a..eded13a 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -655,7 +655,9 @@ define i32 @ctpop_into_extract(ptr %p) {
; CHECKO0-NEXT: // implicit-def: $d2
; CHECKO0-NEXT: fmov s2, w8
; CHECKO0-NEXT: ldr d0, [x0]
-; CHECKO0-NEXT: fmov s1, s0
+; CHECKO0-NEXT: // implicit-def: $q1
+; CHECKO0-NEXT: fmov d1, d0
+; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
; CHECKO0-NEXT: fmov w8, s1
; CHECKO0-NEXT: fmov s1, w8
; CHECKO0-NEXT: // kill: def $d1 killed $s1
@@ -725,7 +727,9 @@ define i32 @ctpop_into_extract(ptr %p) {
; GISELO0-NEXT: // implicit-def: $d2
; GISELO0-NEXT: fmov s2, w8
; GISELO0-NEXT: ldr d0, [x0]
-; GISELO0-NEXT: fmov s1, s0
+; GISELO0-NEXT: // implicit-def: $q1
+; GISELO0-NEXT: fmov d1, d0
+; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
; GISELO0-NEXT: fmov w8, s1
; GISELO0-NEXT: fmov s1, w8
; GISELO0-NEXT: // kill: def $d1 killed $s1
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index 28a8f43..4a1c50b 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -51,6 +51,7 @@ define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) {
;
; CHECK-GI-LABEL: vector_gep_v1i32:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov x9, d0
; CHECK-GI-NEXT: add x8, x9, w8, sxtw
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 9827cb3..1652eb7 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -595,6 +595,8 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
;
; CHECK-GI-LABEL: shl_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsl w8, w8, w9
@@ -771,6 +773,8 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
;
; CHECK-GI-LABEL: ashr_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: asr w8, w8, w9
@@ -943,6 +947,8 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
;
; CHECK-GI-LABEL: lshr_v1i32:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsr w8, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 59cc400..9fd5e65 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -399,7 +399,8 @@ define <3 x ptr> @shufflevector_v3p0(<3 x ptr> %a, <3 x ptr> %b) {
; CHECK-GI-NEXT: fmov x9, d4
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT: fmov d2, d5
+; CHECK-GI-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-GI-NEXT: dup v2.2d, v5.d[0]
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: mov v3.d[1], x9
; CHECK-GI-NEXT: ext v0.16b, v0.16b, v3.16b, #8
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 3a9f12b..0fe1ef5 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -167,16 +167,11 @@ define void @store_v16i16(<16 x i16> %a, ptr %ptr){
}
define void @store_v1i32(<1 x i32> %a, ptr %ptr){
-; CHECK-SD-LABEL: store_v1i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: str s0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: store_v1i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: str s0, [x0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: store_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
store <1 x i32> %a, ptr %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll
index fe8271c..f477fcb 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll
@@ -16,6 +16,17 @@ define <vscale x 16 x i8> @aesd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
+define <vscale x 16 x i8> @aesd_i8_commuted(<vscale x 16 x i8> %a,
+; CHECK-LABEL: aesd_i8_commuted:
+; CHECK: // %bb.0:
+; CHECK-NEXT: aesd z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ <vscale x 16 x i8> %b) {
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.aesd(<vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %a)
+ ret <vscale x 16 x i8> %out
+}
+
;
; AESIMC
;
@@ -43,6 +54,17 @@ define <vscale x 16 x i8> @aese_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 16 x i8> %out
}
+define <vscale x 16 x i8> @aese_i8_commuted(<vscale x 16 x i8> %a,
+; CHECK-LABEL: aese_i8_commuted:
+; CHECK: // %bb.0:
+; CHECK-NEXT: aese z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ <vscale x 16 x i8> %b) {
+ %out = call <vscale x 16 x i8> @llvm.aarch64.sve.aese(<vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %a)
+ ret <vscale x 16 x i8> %out
+}
+
;
; AESMC
;
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 77483eb..63e26a2 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -930,195 +930,85 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
; CHECK-GI-LABEL: vector_to_vector_cast:
; CHECK-GI: ; %bb.0:
; CHECK-GI-NEXT: sub sp, sp, #16
-; CHECK-GI-NEXT: umov.b w8, v0[1]
-; CHECK-GI-NEXT: mov d1, v0[1]
; CHECK-GI-NEXT: umov.b w10, v0[1]
-; CHECK-GI-NEXT: umov.b w9, v0[0]
-; CHECK-GI-NEXT: umov.b w13, v0[0]
-; CHECK-GI-NEXT: umov.b w14, v0[2]
+; CHECK-GI-NEXT: umov.b w9, v0[1]
+; CHECK-GI-NEXT: mov d1, v0[1]
+; CHECK-GI-NEXT: umov.b w8, v0[0]
+; CHECK-GI-NEXT: umov.b w11, v0[0]
+; CHECK-GI-NEXT: umov.b w12, v0[2]
+; CHECK-GI-NEXT: umov.b w13, v0[2]
; CHECK-GI-NEXT: umov.b w15, v0[3]
-; CHECK-GI-NEXT: umov.b w11, v0[2]
; CHECK-GI-NEXT: umov.b w16, v0[4]
-; CHECK-GI-NEXT: umov.b w17, v0[5]
-; CHECK-GI-NEXT: umov.b w12, v0[3]
-; CHECK-GI-NEXT: and w8, w8, #0x1
+; CHECK-GI-NEXT: umov.b w14, v0[3]
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: umov.b w0, v1[1]
-; CHECK-GI-NEXT: bfi w9, w8, #1, #31
-; CHECK-GI-NEXT: bfi w13, w10, #1, #31
-; CHECK-GI-NEXT: and w14, w14, #0x1
-; CHECK-GI-NEXT: umov.b w8, v1[0]
-; CHECK-GI-NEXT: umov.b w10, v1[2]
+; CHECK-GI-NEXT: and w9, w9, #0x1
+; CHECK-GI-NEXT: bfi w8, w10, #1, #31
+; CHECK-GI-NEXT: umov.b w10, v1[1]
+; CHECK-GI-NEXT: and w12, w12, #0x1
+; CHECK-GI-NEXT: bfi w11, w9, #1, #31
+; CHECK-GI-NEXT: umov.b w9, v1[0]
+; CHECK-GI-NEXT: and w13, w13, #0x1
+; CHECK-GI-NEXT: orr w8, w8, w12, lsl #2
+; CHECK-GI-NEXT: umov.b w12, v1[2]
; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2
-; CHECK-GI-NEXT: umov.b w14, v1[3]
-; CHECK-GI-NEXT: and w11, w11, #0x1
-; CHECK-GI-NEXT: and w0, w0, #0x1
+; CHECK-GI-NEXT: orr w11, w11, w13, lsl #2
+; CHECK-GI-NEXT: umov.b w13, v0[5]
; CHECK-GI-NEXT: and w16, w16, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2
-; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3
-; CHECK-GI-NEXT: umov.b w15, v1[4]
-; CHECK-GI-NEXT: umov.b w11, v0[6]
-; CHECK-GI-NEXT: bfi w8, w0, #1, #31
+; CHECK-GI-NEXT: orr w8, w8, w15, lsl #3
+; CHECK-GI-NEXT: umov.b w15, v1[3]
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: and w17, w17, #0x1
-; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4
+; CHECK-GI-NEXT: bfi w9, w10, #1, #31
+; CHECK-GI-NEXT: umov.b w10, v0[6]
; CHECK-GI-NEXT: and w14, w14, #0x1
-; CHECK-GI-NEXT: umov.b w0, v0[7]
-; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2
-; CHECK-GI-NEXT: umov.b w10, v1[5]
-; CHECK-GI-NEXT: umov.b w16, v1[6]
-; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5
-; CHECK-GI-NEXT: umov.b w17, v0[4]
-; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3
+; CHECK-GI-NEXT: orr w8, w8, w16, lsl #4
+; CHECK-GI-NEXT: umov.b w16, v1[4]
; CHECK-GI-NEXT: and w12, w12, #0x1
-; CHECK-GI-NEXT: and w11, w11, #0x1
-; CHECK-GI-NEXT: umov.b w14, v1[7]
-; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3
-; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6
-; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4
-; CHECK-GI-NEXT: umov.b w15, v0[5]
-; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: and w0, w0, #0x1
-; CHECK-GI-NEXT: and w12, w17, #0x1
-; CHECK-GI-NEXT: umov.b w13, v0[1]
-; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5
-; CHECK-GI-NEXT: and w16, w16, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4
-; CHECK-GI-NEXT: umov.b w10, v0[0]
-; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7
-; CHECK-GI-NEXT: and w14, w14, #0x1
-; CHECK-GI-NEXT: and w12, w15, #0x1
-; CHECK-GI-NEXT: umov.b w15, v0[2]
-; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6
-; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5
-; CHECK-GI-NEXT: umov.b w12, v0[6]
-; CHECK-GI-NEXT: strb w11, [sp, #8]
-; CHECK-GI-NEXT: and w11, w13, #0x1
-; CHECK-GI-NEXT: umov.b w13, v0[3]
-; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7
-; CHECK-GI-NEXT: umov.b w14, v0[7]
-; CHECK-GI-NEXT: ldr b0, [sp, #8]
-; CHECK-GI-NEXT: bfi w10, w11, #1, #31
-; CHECK-GI-NEXT: and w11, w15, #0x1
-; CHECK-GI-NEXT: strb w8, [sp, #9]
-; CHECK-GI-NEXT: umov.b w15, v0[4]
-; CHECK-GI-NEXT: and w8, w12, #0x1
-; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2
-; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6
-; CHECK-GI-NEXT: and w9, w13, #0x1
-; CHECK-GI-NEXT: umov.b w11, v0[1]
-; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3
-; CHECK-GI-NEXT: umov.b w10, v0[5]
-; CHECK-GI-NEXT: umov.b w12, v0[0]
-; CHECK-GI-NEXT: and w13, w14, #0x1
-; CHECK-GI-NEXT: umov.b w16, v0[2]
-; CHECK-GI-NEXT: umov.b w17, v0[3]
-; CHECK-GI-NEXT: and w14, w15, #0x1
-; CHECK-GI-NEXT: umov.b w15, v0[2]
-; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7
-; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4
-; CHECK-GI-NEXT: umov.b w13, v0[6]
-; CHECK-GI-NEXT: and w11, w11, #0x1
-; CHECK-GI-NEXT: umov.b w14, v0[3]
-; CHECK-GI-NEXT: strb w8, [sp, #10]
-; CHECK-GI-NEXT: and w8, w10, #0x1
-; CHECK-GI-NEXT: bfi w12, w11, #1, #31
-; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5
-; CHECK-GI-NEXT: umov.b w10, v0[4]
-; CHECK-GI-NEXT: and w9, w15, #0x1
-; CHECK-GI-NEXT: umov.b w11, v0[7]
-; CHECK-GI-NEXT: umov.b w15, v0[1]
-; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2
-; CHECK-GI-NEXT: umov.b w12, v0[5]
+; CHECK-GI-NEXT: orr w9, w9, w12, lsl #2
; CHECK-GI-NEXT: and w13, w13, #0x1
-; CHECK-GI-NEXT: and w14, w14, #0x1
-; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6
-; CHECK-GI-NEXT: umov.b w13, v0[0]
-; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3
-; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: umov.b w14, v0[6]
-; CHECK-GI-NEXT: and w11, w11, #0x1
-; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: umov.b w0, v0[3]
-; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4
-; CHECK-GI-NEXT: and w10, w12, #0x1
-; CHECK-GI-NEXT: umov.b w12, v0[7]
-; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7
-; CHECK-GI-NEXT: bfi w13, w15, #1, #31
-; CHECK-GI-NEXT: and w11, w16, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5
-; CHECK-GI-NEXT: and w10, w14, #0x1
-; CHECK-GI-NEXT: umov.b w14, v0[4]
-; CHECK-GI-NEXT: strb w8, [sp, #11]
-; CHECK-GI-NEXT: umov.b w15, v0[1]
-; CHECK-GI-NEXT: umov.b w16, v0[3]
-; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6
-; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2
-; CHECK-GI-NEXT: and w10, w12, #0x1
-; CHECK-GI-NEXT: and w11, w17, #0x1
-; CHECK-GI-NEXT: umov.b w12, v0[5]
-; CHECK-GI-NEXT: umov.b w17, v0[0]
-; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7
-; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3
-; CHECK-GI-NEXT: umov.b w10, v0[1]
-; CHECK-GI-NEXT: and w11, w14, #0x1
-; CHECK-GI-NEXT: umov.b w14, v0[0]
+; CHECK-GI-NEXT: umov.b w12, v0[4]
+; CHECK-GI-NEXT: orr w8, w8, w13, lsl #5
+; CHECK-GI-NEXT: umov.b w13, v1[5]
; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4
-; CHECK-GI-NEXT: umov.b w11, v0[2]
-; CHECK-GI-NEXT: umov.b w13, v0[6]
-; CHECK-GI-NEXT: and w12, w12, #0x1
-; CHECK-GI-NEXT: bfi w17, w15, #1, #31
-; CHECK-GI-NEXT: umov.b w15, v0[5]
-; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5
+; CHECK-GI-NEXT: orr w9, w9, w15, lsl #3
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: umov.b w12, v0[2]
-; CHECK-GI-NEXT: bfi w14, w10, #1, #31
-; CHECK-GI-NEXT: umov.b w10, v0[4]
-; CHECK-GI-NEXT: ldr b1, [sp, #9]
-; CHECK-GI-NEXT: and w11, w11, #0x1
+; CHECK-GI-NEXT: umov.b w15, v0[7]
+; CHECK-GI-NEXT: orr w8, w8, w10, lsl #6
+; CHECK-GI-NEXT: umov.b w10, v1[6]
+; CHECK-GI-NEXT: and w16, w16, #0x1
+; CHECK-GI-NEXT: orr w9, w9, w16, lsl #4
+; CHECK-GI-NEXT: umov.b w16, v0[5]
+; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3
; CHECK-GI-NEXT: and w13, w13, #0x1
-; CHECK-GI-NEXT: strb w8, [sp, #12]
-; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2
-; CHECK-GI-NEXT: and w14, w16, #0x1
-; CHECK-GI-NEXT: umov.b w16, v0[4]
+; CHECK-GI-NEXT: umov.b w14, v1[7]
; CHECK-GI-NEXT: and w12, w12, #0x1
-; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w13, lsl #6
-; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3
-; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2
+; CHECK-GI-NEXT: orr w9, w9, w13, lsl #5
+; CHECK-GI-NEXT: umov.b w13, v0[6]
+; CHECK-GI-NEXT: orr w11, w11, w12, lsl #4
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: and w17, w0, #0x1
-; CHECK-GI-NEXT: umov.b w0, v0[5]
-; CHECK-GI-NEXT: umov.b w14, v0[6]
-; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4
-; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3
-; CHECK-GI-NEXT: umov.b w11, v0[7]
-; CHECK-GI-NEXT: and w16, w16, #0x1
-; CHECK-GI-NEXT: umov.b w17, v0[6]
-; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5
+; CHECK-GI-NEXT: and w12, w15, #0x1
; CHECK-GI-NEXT: umov.b w15, v0[7]
-; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4
-; CHECK-GI-NEXT: and w16, w0, #0x1
-; CHECK-GI-NEXT: umov.b w0, v0[7]
-; CHECK-GI-NEXT: and w14, w14, #0x1
-; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5
-; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6
-; CHECK-GI-NEXT: and w11, w11, #0x1
-; CHECK-GI-NEXT: and w13, w17, #0x1
+; CHECK-GI-NEXT: orr w9, w9, w10, lsl #6
+; CHECK-GI-NEXT: and w10, w16, #0x1
+; CHECK-GI-NEXT: orr w8, w8, w12, lsl #7
+; CHECK-GI-NEXT: orr w10, w11, w10, lsl #5
+; CHECK-GI-NEXT: and w11, w14, #0x1
; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7
+; CHECK-GI-NEXT: and w11, w13, #0x1
+; CHECK-GI-NEXT: strb w8, [sp, #8]
+; CHECK-GI-NEXT: orr w8, w10, w11, lsl #6
+; CHECK-GI-NEXT: ldr b0, [sp, #8]
+; CHECK-GI-NEXT: strb w9, [sp, #9]
+; CHECK-GI-NEXT: and w9, w15, #0x1
+; CHECK-GI-NEXT: ldr b1, [sp, #9]
+; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7
; CHECK-GI-NEXT: mov.s v0[1], v1[0]
-; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6
-; CHECK-GI-NEXT: and w12, w15, #0x1
+; CHECK-GI-NEXT: strb w8, [sp, #10]
+; CHECK-GI-NEXT: strb w8, [sp, #11]
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7
-; CHECK-GI-NEXT: and w10, w0, #0x1
-; CHECK-GI-NEXT: strb w9, [sp, #13]
-; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7
+; CHECK-GI-NEXT: strb w8, [sp, #12]
+; CHECK-GI-NEXT: strb w8, [sp, #13]
; CHECK-GI-NEXT: strb w8, [sp, #14]
-; CHECK-GI-NEXT: strb w9, [sp, #15]
+; CHECK-GI-NEXT: strb w8, [sp, #15]
; CHECK-GI-NEXT: add sp, sp, #16
; CHECK-GI-NEXT: ret
%bc = bitcast <16 x i1> %arg to <2 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index 1164e02..bd68b21 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -79,11 +79,10 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
; CHECK-GI-FP16-LABEL: add_HalfH:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: fadd h1, h0, h1
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[2]
+; CHECK-GI-FP16-NEXT: faddp h2, v0.2h
; CHECK-GI-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
+; CHECK-GI-FP16-NEXT: fadd h1, h2, h1
; CHECK-GI-FP16-NEXT: fadd h0, h1, h0
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index 1d295a3..1906ca9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -44,27 +44,11 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-NOFP-SD-LABEL: test_v1f32:
-; CHECK-NOFP-SD: // %bb.0:
-; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NOFP-SD-NEXT: ret
-;
-; CHECK-FP-SD-LABEL: test_v1f32:
-; CHECK-FP-SD: // %bb.0:
-; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-FP-SD-NEXT: ret
-;
-; CHECK-NOFP-GI-LABEL: test_v1f32:
-; CHECK-NOFP-GI: // %bb.0:
-; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-NOFP-GI-NEXT: ret
-;
-; CHECK-FP-GI-LABEL: test_v1f32:
-; CHECK-FP-GI: // %bb.0:
-; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-FP-GI-NEXT: ret
+; CHECK-LABEL: test_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
%b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index ee2af11..152eb66 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -44,27 +44,11 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-NOFP-SD-LABEL: test_v1f32:
-; CHECK-NOFP-SD: // %bb.0:
-; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NOFP-SD-NEXT: ret
-;
-; CHECK-FP-SD-LABEL: test_v1f32:
-; CHECK-FP-SD: // %bb.0:
-; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-FP-SD-NEXT: ret
-;
-; CHECK-NOFP-GI-LABEL: test_v1f32:
-; CHECK-NOFP-GI: // %bb.0:
-; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-NOFP-GI-NEXT: ret
-;
-; CHECK-FP-GI-LABEL: test_v1f32:
-; CHECK-FP-GI: // %bb.0:
-; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-FP-GI-NEXT: ret
+; CHECK-LABEL: test_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
%b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
index be61f9b..a1b7118 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
@@ -40,27 +40,11 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-NOFP-SD-LABEL: test_v1f32:
-; CHECK-NOFP-SD: // %bb.0:
-; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NOFP-SD-NEXT: ret
-;
-; CHECK-FP-SD-LABEL: test_v1f32:
-; CHECK-FP-SD: // %bb.0:
-; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-FP-SD-NEXT: ret
-;
-; CHECK-NOFP-GI-LABEL: test_v1f32:
-; CHECK-NOFP-GI: // %bb.0:
-; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-NOFP-GI-NEXT: ret
-;
-; CHECK-FP-GI-LABEL: test_v1f32:
-; CHECK-FP-GI: // %bb.0:
-; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-FP-GI-NEXT: ret
+; CHECK-LABEL: test_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
%b = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index 300081d..d5f999ad 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -44,27 +44,11 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-NOFP-SD-LABEL: test_v1f32:
-; CHECK-NOFP-SD: // %bb.0:
-; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NOFP-SD-NEXT: ret
-;
-; CHECK-FP-SD-LABEL: test_v1f32:
-; CHECK-FP-SD: // %bb.0:
-; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-FP-SD-NEXT: ret
-;
-; CHECK-NOFP-GI-LABEL: test_v1f32:
-; CHECK-NOFP-GI: // %bb.0:
-; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-NOFP-GI-NEXT: ret
-;
-; CHECK-FP-GI-LABEL: test_v1f32:
-; CHECK-FP-GI: // %bb.0:
-; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-FP-GI-NEXT: ret
+; CHECK-LABEL: test_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
%b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
index e735f670..719cac8 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
@@ -40,27 +40,11 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-NOFP-SD-LABEL: test_v1f32:
-; CHECK-NOFP-SD: // %bb.0:
-; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NOFP-SD-NEXT: ret
-;
-; CHECK-FP-SD-LABEL: test_v1f32:
-; CHECK-FP-SD: // %bb.0:
-; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-FP-SD-NEXT: ret
-;
-; CHECK-NOFP-GI-LABEL: test_v1f32:
-; CHECK-NOFP-GI: // %bb.0:
-; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-NOFP-GI-NEXT: ret
-;
-; CHECK-FP-GI-LABEL: test_v1f32:
-; CHECK-FP-GI: // %bb.0:
-; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
-; CHECK-FP-GI-NEXT: ret
+; CHECK-LABEL: test_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
%b = call float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index e1b2170..e22a5a4 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -5,18 +5,11 @@
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @mul_HalfS(<2 x float> %bin.rdx) {
-; CHECK-SD-LABEL: mul_HalfS:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mul_HalfS:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mul_HalfS:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-NEXT: ret
%r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
ret float %r
}
@@ -79,12 +72,9 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h1, h0, h1
-; CHECK-GI-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h1, h1, h2
-; CHECK-GI-FP16-NEXT: fmul h0, h1, h0
+; CHECK-GI-FP16-NEXT: fmul h1, h0, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[3]
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
ret half %r
@@ -475,3 +465,6 @@ declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index 2429cf4..5fd705b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -5,18 +5,11 @@
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @mul_HalfS(<2 x float> %bin.rdx) {
-; CHECK-SD-LABEL: mul_HalfS:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mul_HalfS:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mul_HalfS:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
ret float %r
}
@@ -51,20 +44,17 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
-; CHECK-GI-FP16-NEXT: fmul h1, h2, h3
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h0, h2, h0
; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
ret half %r
@@ -115,8 +105,7 @@ define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v1.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -124,12 +113,10 @@ define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
-; CHECK-GI-FP16-NEXT: fmul h1, h2, h3
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h0, h2, h0
; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
ret half %r
@@ -147,8 +134,7 @@ define float @mul_S(<4 x float> %bin.rdx) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s1
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
ret float %r
@@ -220,8 +206,7 @@ define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -230,12 +215,10 @@ define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
-; CHECK-GI-FP16-NEXT: fmul h1, h2, h3
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h0, h2, h0
; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
ret half %r
@@ -255,8 +238,7 @@ define float @mul_2S(<8 x float> %bin.rdx) {
; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s1
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
ret float %r
@@ -289,9 +271,8 @@ define float @mul_S_init_42(<4 x float> %bin.rdx) {
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
@@ -357,10 +338,8 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-GI-NOFP16-NEXT: mov d3, v1.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s2
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s3
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
@@ -375,18 +354,14 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-GI-FP16-NEXT: mov d3, v1.d[1]
; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v2.4h
; CHECK-GI-FP16-NEXT: fmul v1.4h, v1.4h, v3.4h
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-GI-FP16-NEXT: mov h4, v0.h[3]
-; CHECK-GI-FP16-NEXT: mov h5, v1.h[1]
-; CHECK-GI-FP16-NEXT: mov h6, v1.h[2]
-; CHECK-GI-FP16-NEXT: mov h7, v1.h[3]
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h2
-; CHECK-GI-FP16-NEXT: fmul h2, h3, h4
-; CHECK-GI-FP16-NEXT: fmul h1, h1, h5
-; CHECK-GI-FP16-NEXT: fmul h3, h6, h7
-; CHECK-GI-FP16-NEXT: fmul h0, h0, h2
-; CHECK-GI-FP16-NEXT: fmul h1, h1, h3
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[3]
+; CHECK-GI-FP16-NEXT: mov h3, v1.h[3]
+; CHECK-GI-FP16-NEXT: fmul h4, h0, v0.h[1]
+; CHECK-GI-FP16-NEXT: fmul h0, h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h2, h1, v1.h[1]
+; CHECK-GI-FP16-NEXT: fmul h1, h3, v1.h[2]
+; CHECK-GI-FP16-NEXT: fmul h0, h4, h0
+; CHECK-GI-FP16-NEXT: fmul h1, h2, h1
; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
; CHECK-GI-FP16-NEXT: ret
%r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
@@ -414,10 +389,8 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s2
-; CHECK-GI-NEXT: fmul s1, s1, s3
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -441,10 +414,8 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s2
-; CHECK-GI-NEXT: fmul s1, s1, s3
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -471,12 +442,10 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov d3, v2.d[1]
-; CHECK-GI-NEXT: mov s4, v1.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul v2.2s, v2.2s, v3.2s
-; CHECK-GI-NEXT: fmul s1, s1, s4
-; CHECK-GI-NEXT: mov s3, v2.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
-; CHECK-GI-NEXT: fmul s1, s2, s3
+; CHECK-GI-NEXT: fmul s1, s2, v2.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
@@ -502,10 +471,8 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s2
-; CHECK-GI-NEXT: fmul s1, s1, s3
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -556,10 +523,8 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b)
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov s3, v1.s[1]
-; CHECK-GI-NEXT: fmul s0, s0, s2
-; CHECK-GI-NEXT: fmul s1, s1, s3
+; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
; CHECK-GI-NEXT: fmul s1, s0, s1
; CHECK-GI-NEXT: fmul s0, s1, s0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 0806f7d..d5c040e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -57,16 +57,11 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
}
define i32 @test_v1i32(<1 x i32> %a) nounwind {
-; CHECK-SD-LABEL: test_v1i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_v1i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
ret i32 %b
}
diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll
index 6026432..53456c4 100644
--- a/llvm/test/CodeGen/AArch64/vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll
@@ -755,20 +755,13 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
; CHECK-i32-NEXT: fcvtzs v0.2s, v0.2s
; CHECK-i32-NEXT: ret
;
-; CHECK-i64-SD-LABEL: lrint_v1f32:
-; CHECK-i64-SD: // %bb.0:
-; CHECK-i64-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-i64-SD-NEXT: frintx s0, s0
-; CHECK-i64-SD-NEXT: fcvtzs x8, s0
-; CHECK-i64-SD-NEXT: fmov d0, x8
-; CHECK-i64-SD-NEXT: ret
-;
-; CHECK-i64-GI-LABEL: lrint_v1f32:
-; CHECK-i64-GI: // %bb.0:
-; CHECK-i64-GI-NEXT: frintx s0, s0
-; CHECK-i64-GI-NEXT: fcvtzs x8, s0
-; CHECK-i64-GI-NEXT: fmov d0, x8
-; CHECK-i64-GI-NEXT: ret
+; CHECK-i64-LABEL: lrint_v1f32:
+; CHECK-i64: // %bb.0:
+; CHECK-i64-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-NEXT: frintx s0, s0
+; CHECK-i64-NEXT: fcvtzs x8, s0
+; CHECK-i64-NEXT: fmov d0, x8
+; CHECK-i64-NEXT: ret
%a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
ret <1 x iXLen> %a
}
@@ -1335,3 +1328,7 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) {
ret <32 x iXLen> %a
}
declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-i32-GI: {{.*}}
+; CHECK-i64-GI: {{.*}}
+; CHECK-i64-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
index cc2c800..78766b4 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
@@ -276,3 +276,28 @@ define void @kernel_argument_promotion_pattern_inter_procedure(ptr %p, i32 %val)
call void @use_argument_after_promotion(ptr %p.cast.1, i32 %val)
ret void
}
+
+define amdgpu_kernel void @kernel_argument_with_known_as(ptr addrspace(1) %p1, ptr addrspace(3) %p3, i32 %val) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_argument_with_known_as(
+; CHECK-SAME: ptr addrspace(1) [[P1:%.*]], ptr addrspace(3) [[P3:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[P1_CAST:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
+; CHECK-NEXT: [[P3_CAST:%.*]] = addrspacecast ptr addrspace(3) [[P3]] to ptr
+; CHECK-NEXT: [[B:%.*]] = icmp eq i32 [[VAL]], 0
+; CHECK-NEXT: [[P:%.*]] = select i1 [[B]], ptr [[P1_CAST]], ptr [[P3_CAST]]
+; CHECK-NEXT: [[ATOMIC_ADD:%.*]] = atomicrmw add ptr [[P]], i32 1 syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0:![0-9]+]], !amdgpu.no.fine.grained.memory [[META1:![0-9]+]], !amdgpu.no.remote.memory [[META1]]
+; CHECK-NEXT: ret void
+;
+ %p1.cast = addrspacecast ptr addrspace(1) %p1 to ptr
+ %p3.cast = addrspacecast ptr addrspace(3) %p3 to ptr
+ %b = icmp eq i32 %val, 0
+ %p = select i1 %b, ptr %p1.cast, ptr %p3.cast
+ %atomic.add = atomicrmw add ptr %p, i32 1 syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
+ ret void
+}
+
+!0 = !{i32 5, i32 6}
+!1 = !{}
+;.
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d..b372dec 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_andn2_b32 s2, s2, s0
+; GFX7-NEXT: s_xor_b32 s1, s1, s2
; GFX7-NEXT: s_and_b32 s0, s1, s0
-; GFX7-NEXT: s_or_b32 s0, s2, s0
+; GFX7-NEXT: s_xor_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_andn2_b32 s2, s2, s0
+; GFX8-NEXT: s_xor_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s1, s0
-; GFX8-NEXT: s_or_b32 s0, s2, s0
+; GFX8-NEXT: s_xor_b32 s0, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_andn2_b32 s2, s2, s0
+; GFX10-NEXT: s_xor_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s1, s0
-; GFX10-NEXT: s_or_b32 s0, s2, s0
+; GFX10-NEXT: s_xor_b32 s0, s0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
@@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX7-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX7-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX7-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX8-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX10-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll b/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll
new file mode 100644
index 0000000..ad26dfa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bit-op-reduce-width-known-bits.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Check for situations where we could reduce the width of bitwise
+; operations.
+
+
+; Should be able to reduce this to a 32-bit xor plus a copy
+; https://alive2.llvm.org/ce/z/9LddFX
+define i64 @v_xor_i64_known_hi_i32_from_arg_range(i64 range(i64 0, 4294967296) %arg0, i64 %arg1) {
+; CHECK-LABEL: v_xor_i64_known_hi_i32_from_arg_range:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %xor = xor i64 %arg0, %arg1
+ ret i64 %xor
+}
+
+; Should be able to reduce this to a 32-bit or plus a copy
+; https://alive2.llvm.org/ce/z/HaXnBJ
+define i64 @v_or_i64_known_hi_i32_from_arg_range(i64 range(i64 0, 4294967296) %arg0, i64 %arg1) {
+; CHECK-LABEL: v_or_i64_known_hi_i32_from_arg_range:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_or_b32_e32 v1, v1, v3
+; CHECK-NEXT: v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %or = or i64 %arg0, %arg1
+ ret i64 %or
+}
+
+; https://alive2.llvm.org/ce/z/M96Ror
+; Should be able to reduce this to a 32-bit and plus a copy
+define i64 @v_and_i64_known_i32_from_arg_range(i64 range(i64 -4294967296, 0) %arg0, i64 %arg1) {
+; CHECK-LABEL: v_and_i64_known_i32_from_arg_range:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, v1, v3
+; CHECK-NEXT: v_and_b32_e32 v0, v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %and = and i64 %arg0, %arg1
+ ret i64 %and
+}
+
+define i64 @s_xor_i64_known_i32_from_arg_range(i64 range(i64 0, 65) inreg %arg) {
+; CHECK-LABEL: s_xor_i64_known_i32_from_arg_range:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_not_b64 s[4:5], s[16:17]
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %xor = xor i64 %arg, -1
+ ret i64 %xor
+}
+
+define i64 @v_xor_i64_known_i32_from_call_range() {
+; CHECK-LABEL: v_xor_i64_known_i32_from_call_range:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_not_b32_e32 v1, v1
+; CHECK-NEXT: v_not_b32_e32 v0, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %call = call range(i64 0, 65) i64 asm "; def $0", "=v"()
+ %xor = xor i64 %call, -1
+ ret i64 %xor
+}
+
+define i64 @s_xor_i64_known_i32_from_call_range() {
+; CHECK-LABEL: s_xor_i64_known_i32_from_call_range:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_not_b64 s[4:5], s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %call = call range(i64 0, 65) i64 asm "; def $0", "=s"()
+ %xor = xor i64 %call, -1
+ ret i64 %xor
+}
+
+; Reduced from -amdgpu-codegenprepare-expand-div64 output; produces a
+; not of 0 which ideally would fold out.
+; FIXME: Produces not of constant 0
+define i64 @v_xor_i64_known_i32_from_range_use_out_of_block(i64 %x) {
+; CHECK-LABEL: v_xor_i64_known_i32_from_range_use_out_of_block:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ffbh_u32_e32 v2, v0
+; CHECK-NEXT: v_add_u32_e32 v2, 32, v2
+; CHECK-NEXT: v_ffbh_u32_e32 v3, v1
+; CHECK-NEXT: v_min_u32_e32 v4, v2, v3
+; CHECK-NEXT: v_mov_b32_e32 v5, 0
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: ; %bb.1: ; %inc
+; CHECK-NEXT: v_not_b32_e32 v2, v4
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ctlz = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 %x, i1 true)
+ %cmp.entry.not = icmp eq i64 %ctlz, %x
+ br i1 %cmp.entry.not, label %inc, label %ret
+
+inc: ; preds = %entry
+ %i1 = xor i64 %ctlz, -1
+ %i2 = add i64 %x, %i1
+ ret i64 %i2
+
+ret: ; preds = %loop, %entry
+ ret i64 0
+}
+
+define i64 @s_xor_i64_known_i32_from_range_use_out_of_block(i64 inreg %x) {
+; CHECK-LABEL: s_xor_i64_known_i32_from_range_use_out_of_block:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_flbit_i32_b64 s4, s[16:17]
+; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], s[16:17]
+; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT: ; %bb.1: ; %inc
+; CHECK-NEXT: s_not_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s16, s4
+; CHECK-NEXT: s_addc_u32 s5, s17, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB7_2: ; %ret
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ctlz = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 %x, i1 true)
+ %cmp.entry.not = icmp eq i64 %ctlz, %x
+ br i1 %cmp.entry.not, label %inc, label %ret
+
+inc: ; preds = %entry
+ %i1 = xor i64 %ctlz, -1
+ %i2 = add i64 %x, %i1
+ ret i64 %i2
+
+ret: ; preds = %loop, %entry
+ ret i64 0
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index fe2b0bb..e7177a5 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -961,3 +961,25 @@ body: |
S_ENDPGM 0, implicit %2, implicit %3
...
+
+---
+name: constant_v_or_b32_uses_subreg_or_0_regression
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GCN-LABEL: name: constant_v_or_b32_uses_subreg_or_0_regression
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY1]]
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %3:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, %0:vgpr_32, %subreg.sub1
+ %4:vgpr_32 = V_OR_B32_e64 %3.sub0:vreg_64, %1, implicit $exec
+ S_ENDPGM 0, implicit %4
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index e674b57..3304dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -2,6 +2,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
; GCN1-LABEL: atomic_add_i32_offset:
@@ -46,6 +48,21 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -95,6 +112,21 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 offset:4092
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -146,6 +178,23 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_max_offset_p1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -204,6 +253,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -266,6 +331,26 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -337,6 +422,27 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -384,6 +490,21 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -437,6 +558,22 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -494,6 +631,26 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_add_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -560,6 +717,27 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_add_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_add_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -610,6 +788,21 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -668,6 +861,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -730,6 +939,26 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -801,6 +1030,27 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -848,6 +1098,21 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -901,6 +1166,22 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -958,6 +1239,26 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_and_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -1024,6 +1325,27 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_and_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_and_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -1074,6 +1396,21 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -1132,6 +1469,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -1194,6 +1547,26 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -1265,6 +1638,27 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -1312,6 +1706,21 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -1365,6 +1774,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -1422,6 +1847,26 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_sub_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -1488,6 +1933,27 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_sub_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -1535,6 +2001,20 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -1593,6 +2073,21 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -1652,6 +2147,25 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -1723,6 +2237,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -1767,6 +2301,20 @@ define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_smax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -1820,6 +2368,21 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
@@ -1874,6 +2437,25 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: flat_atomic_smax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_i32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -1940,6 +2522,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_max_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_max_i32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -1987,6 +2589,20 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -2045,6 +2661,21 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -2104,6 +2735,25 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -2175,6 +2825,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -2219,6 +2889,20 @@ define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_umax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -2272,6 +2956,21 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
@@ -2326,6 +3025,25 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: flat_atomic_umax v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_max_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -2392,6 +3110,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umax_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_max_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -2439,6 +3177,20 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -2497,6 +3249,21 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -2556,6 +3323,25 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -2627,6 +3413,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -2671,6 +3477,20 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -2724,6 +3544,21 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
@@ -2778,6 +3613,25 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: flat_atomic_smin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_i32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -2844,6 +3698,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_min_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_min_i32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -2891,6 +3765,20 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -2949,6 +3837,21 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
@@ -3008,6 +3911,25 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -3079,6 +4001,26 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -3123,6 +4065,20 @@ define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -3176,6 +4132,21 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out2
@@ -3230,6 +4201,25 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: flat_atomic_umin v[0:1], v2
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_min_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -3296,6 +4286,26 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_umin_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_min_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
@@ -3346,6 +4356,21 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -3404,6 +4429,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -3466,6 +4507,26 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -3537,6 +4598,27 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -3584,6 +4666,21 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -3637,6 +4734,22 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -3694,6 +4807,26 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_or_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -3760,6 +4893,27 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_or_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_or_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -3810,6 +4964,21 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -3859,6 +5028,21 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_f32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
@@ -3917,6 +5101,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -3979,6 +5179,26 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -4050,6 +5270,27 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -4097,6 +5338,21 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -4150,6 +5406,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -4207,6 +5479,26 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -4273,6 +5565,27 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xchg_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -4325,6 +5638,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -4386,6 +5712,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -4455,6 +5797,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x3c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v[2:3], v[0:1] offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -4532,6 +5895,28 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x44
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[2:3], v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -4580,6 +5965,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
ret void
@@ -4636,6 +6034,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
%flag = extractvalue { i32, i1 } %val, 0
@@ -4700,6 +6114,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x3c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v[2:3], v[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -4772,6 +6207,28 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x44
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[2:3], v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -4823,6 +6280,21 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -4881,6 +6353,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -4943,6 +6431,26 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -5014,6 +6522,27 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -5061,6 +6590,21 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -5114,6 +6658,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -5171,6 +6731,26 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_xor_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -5237,6 +6817,27 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_xor_i32_ret_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_xor_b32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -5290,6 +6891,19 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %in, i32 4
%val = load atomic i32, ptr %gep seq_cst, align 4
@@ -5339,6 +6953,19 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
store i32 %val, ptr %out
@@ -5403,6 +7030,25 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %in, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -5465,6 +7111,25 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index)
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %in, i64 %index
%val = load atomic i32, ptr %ptr seq_cst, align 4
@@ -5509,6 +7174,17 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
store atomic i32 %in, ptr %gep seq_cst, align 4
@@ -5548,6 +7224,17 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
ret void
@@ -5599,6 +7286,21 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_i32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -5648,6 +7350,21 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_i32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
store atomic i32 %in, ptr %ptr seq_cst, align 4
@@ -5700,6 +7417,19 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr %in, i32 4
%val = load atomic float, ptr %gep seq_cst, align 4
@@ -5749,6 +7479,19 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = load atomic float, ptr %in seq_cst, align 4
store float %val, ptr %out
@@ -5813,6 +7556,25 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %in, i64 %index
%gep = getelementptr float, ptr %ptr, i32 4
@@ -5875,6 +7637,25 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index)
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %in, i64 %index
%val = load atomic float, ptr %ptr seq_cst, align 4
@@ -5919,6 +7700,17 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr %out, i32 4
store atomic float %in, ptr %gep seq_cst, align 4
@@ -5958,6 +7750,17 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
store atomic float %in, ptr %out seq_cst, align 4
ret void
@@ -6009,6 +7812,21 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %out, i64 %index
%gep = getelementptr float, ptr %ptr, i32 4
@@ -6058,6 +7876,21 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32_addr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr float, ptr %out, i64 %index
store atomic float %in, ptr %ptr seq_cst, align 4
@@ -6110,6 +7943,33 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_i8_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_i8_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr %in, i64 16
%val = load atomic i8, ptr %gep seq_cst, align 1
@@ -6159,6 +8019,33 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_i8:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_i8:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%val = load atomic i8, ptr %in seq_cst, align 1
store i8 %val, ptr %out
@@ -6220,6 +8107,43 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_i8_addr64_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_i8_addr64_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%ptr = getelementptr i8, ptr %in, i64 %index
%gep = getelementptr i8, ptr %ptr, i64 16
@@ -6265,6 +8189,28 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_i8_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i8_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr %out, i64 16
store atomic i8 %in, ptr %gep seq_cst, align 1
@@ -6304,6 +8250,28 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_i8:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i8:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic i8 %in, ptr %out seq_cst, align 1
ret void
@@ -6352,6 +8320,33 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_i8_addr64_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i8_addr64_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%ptr = getelementptr i8, ptr %out, i64 %index
%gep = getelementptr i8, ptr %ptr, i64 16
@@ -6405,6 +8400,33 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_i16_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_i16_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr %in, i64 8
%val = load atomic i16, ptr %gep seq_cst, align 2
@@ -6454,6 +8476,33 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%val = load atomic i16, ptr %in seq_cst, align 2
store i16 %val, ptr %out
@@ -6518,6 +8567,45 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_i16_addr64_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_i16_addr64_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%ptr = getelementptr i16, ptr %in, i64 %index
%gep = getelementptr i16, ptr %ptr, i64 8
@@ -6563,6 +8651,28 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_i16_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i16_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr %out, i64 8
store atomic i16 %in, ptr %gep seq_cst, align 2
@@ -6602,6 +8712,28 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic i16 %in, ptr %out seq_cst, align 2
ret void
@@ -6653,6 +8785,36 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64
; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_i16_addr64_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
+; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i16_addr64_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%ptr = getelementptr i16, ptr %out, i64 %index
%gep = getelementptr i16, ptr %ptr, i64 8
@@ -6697,6 +8859,28 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_f16_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_f16_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr half, ptr %out, i64 8
store atomic half %in, ptr %gep seq_cst, align 2
@@ -6736,6 +8920,28 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic half %in, ptr %out seq_cst, align 2
ret void
@@ -6774,6 +8980,28 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_bf16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_bf16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %out, i64 8
store atomic bfloat %in, ptr %out seq_cst, align 2
ret void
@@ -6812,6 +9040,28 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_store_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
store atomic bfloat %in, ptr %out seq_cst, align 2
ret void
}
@@ -6859,6 +9109,21 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -6908,6 +9173,21 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:4092
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -6959,6 +9239,23 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_max_offset_p1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -7017,6 +9314,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -7079,6 +9392,26 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_incr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -7150,6 +9483,27 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_ret_incr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -7197,6 +9551,21 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -7250,6 +9619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -7307,6 +9692,26 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_incr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -7373,6 +9778,27 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_inc_i32_ret_incr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -7423,6 +9849,21 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -7472,6 +9913,21 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:4092
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -7523,6 +9979,23 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_max_offset_p1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -7581,6 +10054,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_ret_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
@@ -7643,6 +10132,26 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_decr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -7714,6 +10223,27 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_ret_decr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -7761,6 +10291,21 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -7814,6 +10359,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out2
@@ -7871,6 +10432,26 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_decr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -7937,6 +10518,27 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_i32_ret_decr64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s8
+; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
@@ -7990,6 +10592,33 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_f16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_f16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
%val = load atomic half, ptr %gep seq_cst, align 2
store half %val, ptr %out
@@ -8038,6 +10667,33 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
store half %val, ptr %out
ret void
@@ -8089,6 +10745,33 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_bf16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_bf16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
%val = load atomic bfloat, ptr %gep seq_cst, align 2
store bfloat %val, ptr %out
@@ -8137,6 +10820,33 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: atomic_load_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_load_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
store bfloat %val, ptr %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index d00fd9b..74c4a2d 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -43,8 +43,7 @@ body: |
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
- ; GCN-NEXT: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 0, [[DEF1]], implicit $exec
- ; GCN-NEXT: [[V_XOR_B32_e32_1:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[DEF2]], [[REG_SEQUENCE]].sub0, implicit $exec
+ ; GCN-NEXT: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[DEF2]], [[REG_SEQUENCE]].sub0, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir
index b1aa889..dc03eb7 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir
@@ -8,8 +8,8 @@ body: |
; CHECK-LABEL: name: test_tryFoldZeroHighBits_skips_nonreg
; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
- ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, 0, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit [[V_AND_B32_e64_]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]]
%0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
%2:vgpr_32 = V_AND_B32_e64 65535, %1.sub0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 60ef493..0512b9b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -7589,15 +7589,26 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out)
; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_i8_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] offset:16
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_i8_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1] offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i8_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %out, i64 16
store atomic i8 %in, ptr addrspace(1) %gep seq_cst, align 1
@@ -7637,15 +7648,26 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_i8:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_i8:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i8:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1
ret void
@@ -7700,7 +7722,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7778,7 +7800,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7838,15 +7860,26 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_i16_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_i16_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i16_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %out, i64 8
store atomic i16 %in, ptr addrspace(1) %gep seq_cst, align 2
@@ -7886,15 +7919,26 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_i16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_i16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2
ret void
@@ -7935,15 +7979,26 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_f16_offset:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_f16_offset:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_f16_offset:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr half, ptr addrspace(1) %out, i64 8
store atomic half %in, ptr addrspace(1) %gep seq_cst, align 2
@@ -7983,15 +8038,26 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic half %in, ptr addrspace(1) %out seq_cst, align 2
ret void
@@ -8032,15 +8098,26 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1)
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_bf16_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_bf16_offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_bf16_offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
+; GFX11-FAKE16-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8
store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2
ret void
@@ -8079,15 +8156,26 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: atomic_store_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: atomic_store_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: atomic_store_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2
ret void
}
@@ -9099,7 +9187,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9176,7 +9264,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9249,7 +9337,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9326,7 +9414,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 6925a98..e1b4cad 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,16 +289,16 @@ entry:
define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
; GCN-LABEL: half4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
; GCN-NEXT: s_lshl_b32 s6, s6, 4
; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec,
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s3, s3, 4
+; GCN-NEXT: s_xor_b32 s4, s2, 0x3c003c00
; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT: s_andn2_b32 s2, s2, s3
-; GCN-NEXT: s_and_b32 s3, s3, 0x3c003c00
-; GCN-NEXT: s_or_b32 s2, s3, s2
+; GCN-NEXT: s_and_b32 s3, s4, s3
+; GCN-NEXT: s_xor_b32 s2, s3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
@@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec,
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s3, s3, 4
+; GCN-NEXT: s_xor_b32 s4, s2, 0x10001
; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT: s_andn2_b32 s2, s2, s3
-; GCN-NEXT: s_and_b32 s3, s3, 0x10001
-; GCN-NEXT: s_or_b32 s2, s3, s2
+; GCN-NEXT: s_and_b32 s3, s4, s3
+; GCN-NEXT: s_xor_b32 s2, s3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
@@ -417,16 +417,16 @@ entry:
define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
; GCN-LABEL: short4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x10001
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
; GCN-NEXT: s_lshl_b32 s6, s6, 4
; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -442,15 +442,15 @@ entry:
define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
; GCN-LABEL: byte8_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s4, s6, 3
-; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT: s_and_b32 s7, s5, 0x1010101
-; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
+; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101
+; GCN-NEXT: s_lshl_b32 s6, s6, 3
+; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101
+; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index be16fac..44bd409 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_lshl_b32 s0, s3, 4
-; SI-NEXT: s_lshl_b32 s0, 0xffff, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_andn2_b32 s1, s2, s0
-; SI-NEXT: s_and_b32 s0, s0, 0x50005
-; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_lshl_b32 s1, s3, 4
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_xor_b32 s0, s2, 0x50005
+; SI-NEXT: s_lshl_b32 s1, 0xffff, s1
+; SI-NEXT: s_and_b32 s0, s0, s1
+; SI-NEXT: s_xor_b32 s0, s0, s2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s3, 4
-; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_andn2_b32 s1, s2, s0
-; VI-NEXT: s_and_b32 s0, s0, 0x50005
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_lshl_b32 s1, s3, 4
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_xor_b32 s0, s2, 0x50005
+; VI-NEXT: s_lshl_b32 s1, 0xffff, s1
+; VI-NEXT: s_and_b32 s0, s0, s1
+; VI-NEXT: s_xor_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_lshl_b32 s0, s8, 4
+; SI-NEXT: s_lshl_b32 s8, s8, 4
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
-; SI-NEXT: s_and_b32 s9, s1, 0x50005
-; SI-NEXT: s_and_b32 s8, s0, 0x50005
-; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
-; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT: s_xor_b32 s1, s3, 0x50005
+; SI-NEXT: s_xor_b32 s0, s2, 0x50005
+; SI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: v_mov_b32_e32 v0, s0
@@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s8, 4
-; VI-NEXT: s_mov_b32 s8, 0x50005
+; VI-NEXT: s_mov_b32 s0, 0x50005
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
-; VI-NEXT: s_mov_b32 s9, s8
-; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; VI-NEXT: s_mov_b32 s1, s0
+; VI-NEXT: s_lshl_b32 s8, s8, 4
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8
; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1594,35 +1594,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i8:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s4, s[8:9], 0x13
+; SI-NEXT: s_load_dword s4, s[8:9], 0xa
+; SI-NEXT: s_load_dword s5, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dword s5, s[8:9], 0xa
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s4, s4, 3
-; SI-NEXT: s_lshl_b32 s4, 0xff, s4
-; SI-NEXT: s_andn2_b32 s5, s5, s4
-; SI-NEXT: s_and_b32 s4, s4, 0x505
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_xor_b32 s6, s4, 0x505
+; SI-NEXT: s_lshl_b32 s5, s5, 3
+; SI-NEXT: s_lshl_b32 s5, 0xff, s5
+; SI-NEXT: s_and_b32 s5, s6, s5
+; SI-NEXT: s_xor_b32 s4, s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT: s_load_dword s4, s[8:9], 0x28
+; VI-NEXT: s_load_dword s5, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_load_dword s5, s[8:9], 0x28
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 3
-; VI-NEXT: s_lshl_b32 s4, 0xff, s4
-; VI-NEXT: s_and_b32 s6, s4, 0x505
-; VI-NEXT: s_xor_b32 s4, s4, 0xffff
-; VI-NEXT: s_and_b32 s4, s4, s5
-; VI-NEXT: s_or_b32 s4, s6, s4
+; VI-NEXT: s_xor_b32 s6, s4, 0x505
+; VI-NEXT: s_lshl_b32 s5, s5, 3
+; VI-NEXT: s_lshl_b32 s5, 0xff, s5
+; VI-NEXT: s_and_b32 s5, s6, s5
+; VI-NEXT: s_xor_b32 s4, s5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -1636,17 +1635,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i8:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s4, s[8:9], 0x13
+; SI-NEXT: s_load_dword s4, s[8:9], 0xa
+; SI-NEXT: s_load_dword s5, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dword s5, s[8:9], 0xa
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s4, s4, 3
-; SI-NEXT: s_lshl_b32 s4, 0xff, s4
-; SI-NEXT: s_andn2_b32 s5, s5, s4
-; SI-NEXT: s_and_b32 s4, s4, 0x5050505
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_xor_b32 s6, s4, 0x5050505
+; SI-NEXT: s_lshl_b32 s5, s5, 3
+; SI-NEXT: s_lshl_b32 s5, 0xff, s5
+; SI-NEXT: s_and_b32 s5, s6, s5
+; SI-NEXT: s_xor_b32 s4, s5, s4
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -1656,17 +1655,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
;
; VI-LABEL: dynamic_insertelement_v3i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT: s_load_dword s4, s[8:9], 0x28
+; VI-NEXT: s_load_dword s5, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_load_dword s5, s[8:9], 0x28
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 3
-; VI-NEXT: s_lshl_b32 s4, 0xff, s4
-; VI-NEXT: s_andn2_b32 s5, s5, s4
-; VI-NEXT: s_and_b32 s4, s4, 0x5050505
-; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_xor_b32 s6, s4, 0x5050505
+; VI-NEXT: s_lshl_b32 s5, s5, 3
+; VI-NEXT: s_lshl_b32 s5, 0xff, s5
+; VI-NEXT: s_and_b32 s5, s6, s5
+; VI-NEXT: s_xor_b32 s4, s5, s4
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -1681,34 +1680,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4i8:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s4, s[8:9], 0x13
+; SI-NEXT: s_load_dword s4, s[8:9], 0xa
+; SI-NEXT: s_load_dword s5, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_load_dword s5, s[8:9], 0xa
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s4, s4, 3
-; SI-NEXT: s_lshl_b32 s4, 0xff, s4
-; SI-NEXT: s_andn2_b32 s5, s5, s4
-; SI-NEXT: s_and_b32 s4, s4, 0x5050505
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_xor_b32 s6, s4, 0x5050505
+; SI-NEXT: s_lshl_b32 s5, s5, 3
+; SI-NEXT: s_lshl_b32 s5, 0xff, s5
+; SI-NEXT: s_and_b32 s5, s6, s5
+; SI-NEXT: s_xor_b32 s4, s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT: s_load_dword s4, s[8:9], 0x28
+; VI-NEXT: s_load_dword s5, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_load_dword s5, s[8:9], 0x28
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 3
-; VI-NEXT: s_lshl_b32 s4, 0xff, s4
-; VI-NEXT: s_andn2_b32 s5, s5, s4
-; VI-NEXT: s_and_b32 s4, s4, 0x5050505
-; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_xor_b32 s6, s4, 0x5050505
+; VI-NEXT: s_lshl_b32 s5, s5, 3
+; VI-NEXT: s_lshl_b32 s5, 0xff, s5
+; VI-NEXT: s_and_b32 s5, s6, s5
+; VI-NEXT: s_xor_b32 s4, s5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
@@ -1721,20 +1720,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; SI-LABEL: s_dynamic_insertelement_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; SI-NEXT: s_load_dword s8, s[8:9], 0x4
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dword s8, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_lshl_b32 s0, s8, 3
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
-; SI-NEXT: s_and_b32 s9, s1, 0x5050505
+; SI-NEXT: s_lshl_b32 s8, s8, 3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; SI-NEXT: s_and_b32 s8, s0, 0x5050505
-; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; SI-NEXT: s_xor_b32 s1, s3, 0x5050505
+; SI-NEXT: s_xor_b32 s0, s2, 0x5050505
+; SI-NEXT: s_lshl_b64 s[8:9], 0xff, s8
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1743,20 +1742,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_load_dword s8, s[8:9], 0x10
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s8, 3
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
-; VI-NEXT: s_and_b32 s9, s1, 0x5050505
+; VI-NEXT: s_lshl_b32 s8, s8, 3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; VI-NEXT: s_and_b32 s8, s0, 0x5050505
-; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT: s_xor_b32 s1, s3, 0x5050505
+; VI-NEXT: s_xor_b32 s0, s2, 0x5050505
+; VI-NEXT: s_lshl_b64 s[8:9], 0xff, s8
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index e0dacb7..a0ad632 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s2, s6, 4
-; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, s7, s2
-; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7
-; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: s_lshl_b32 s3, s6, 4
+; GFX9-NEXT: s_xor_b32 s2, s7, 0x3e703e7
+; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_and_b32 s2, s2, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s0, s4, 4
-; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
-; VI-NEXT: s_andn2_b32 s1, s2, s0
-; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_lshl_b32 s1, s4, 4
+; VI-NEXT: s_xor_b32 s0, s2, 0x3e703e7
+; VI-NEXT: s_lshl_b32 s1, 0xffff, s1
+; VI-NEXT: s_and_b32 s0, s0, s1
+; VI-NEXT: s_xor_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshl_b32 s0, s4, 4
-; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
-; CI-NEXT: s_andn2_b32 s1, s2, s0
-; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7
-; CI-NEXT: s_or_b32 s0, s0, s1
+; CI-NEXT: s_lshl_b32 s1, s4, 4
+; CI-NEXT: s_xor_b32 s0, s2, 0x3e703e7
+; CI-NEXT: s_lshl_b32 s1, 0xffff, s1
+; CI-NEXT: s_and_b32 s0, s0, s1
+; CI-NEXT: s_xor_b32 s0, s0, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s3, s4, 4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s4, s2, 0x3e703e7
; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3
-; GFX11-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_and_b32 s3, s4, s3
+; GFX11-NEXT: s_xor_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 15eb41a..df49625 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -404,12 +404,11 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v5, v10
+; GCN-IR-NEXT: v_not_b32_e32 v4, v10
; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v8
-; GCN-IR-NEXT: v_not_b32_e32 v4, 0
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v5, v11
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v11
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], -1, 0, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index c729c3f..47dfa9f 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -380,12 +380,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v7, v12
+; GCN-IR-NEXT: v_not_b32_e32 v6, v12
; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8
-; GCN-IR-NEXT: v_not_b32_e32 v6, 0
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v13
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
+; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 5acbb04..e901793 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -348,10 +348,9 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v10
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
; GCN-IR-NEXT: v_not_b32_e32 v0, v14
-; GCN-IR-NEXT: v_not_b32_e32 v1, 0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v15
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], -1, 0, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index 69724aa..321b645 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -5,10 +5,11 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
; GCN-LABEL: s_out32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s0, s0, s2
-; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%mx = and i32 %x, %mask
@@ -22,10 +23,11 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
; GCN-LABEL: s_out64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17]
-; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GCN-NEXT: s_setpc_b64 s[30:31]
%mx = and i64 %x, %mask
@@ -427,10 +429,11 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
; GCN-LABEL: s_out_constant_varx_42:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s0, s2, s0
-; GCN-NEXT: s_and_not1_b32 s1, 42, s2
+; GCN-NEXT: s_xor_b32 s0, s0, 42
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_xor_b32 s0, s0, 42
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%notmask = xor i32 %mask, -1
@@ -462,10 +465,11 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
; GCN-LABEL: s_out_constant_varx_42_invmask:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_and_not1_b32 s0, s0, s2
-; GCN-NEXT: s_and_b32 s1, s2, 42
+; GCN-NEXT: s_xor_b32 s1, s0, 42
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s0, s1, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%notmask = xor i32 %mask, -1
@@ -560,10 +564,11 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
; GCN-LABEL: s_out_constant_42_vary:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_and_b32 s0, s2, 42
-; GCN-NEXT: s_and_not1_b32 s1, s1, s2
+; GCN-NEXT: s_xor_b32 s0, s1, 42
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_xor_b32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%notmask = xor i32 %mask, -1
@@ -595,10 +600,11 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
; GCN-LABEL: s_out_constant_42_vary_invmask:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_and_not1_b32 s0, 42, s2
-; GCN-NEXT: s_and_b32 s1, s2, s1
+; GCN-NEXT: s_xor_b32 s0, s1, 42
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, s0, s2
+; GCN-NEXT: s_xor_b32 s0, s0, 42
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
%notmask = xor i32 %mask, -1
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 94f1b83..6480a88 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -355,12 +355,11 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v7, v12
+; GCN-IR-NEXT: v_not_b32_e32 v6, v12
; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8
-; GCN-IR-NEXT: v_not_b32_e32 v6, 0
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v13
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
+; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
diff --git a/llvm/test/CodeGen/ARM/fpcmp_ueq.ll b/llvm/test/CodeGen/ARM/fpcmp_ueq.ll
index 698c750..f77720f 100644
--- a/llvm/test/CodeGen/ARM/fpcmp_ueq.ll
+++ b/llvm/test/CodeGen/ARM/fpcmp_ueq.ll
@@ -9,12 +9,13 @@ entry:
}
; CHECK-ARMv4-LABEL: f7:
-; CHECK-ARMv4-DAG: bl ___eqsf2
-; CHECK-ARMv4-DAG: bl ___unordsf2
-; CHECK-ARMv4: cmp r0, #0
-; CHECK-ARMv4: movne r0, #1
-; CHECK-ARMv4: orrs r0, r0,
-; CHECK-ARMv4: moveq r0, #42
+; CHECK-ARMv4: bl ___eqsf2
+; CHECK-ARMv4-NEXT: rsbs r1, r0, #0
+; CHECK-ARMv4-NEXT: adc r6, r0, r1
+
+; CHECK-ARMv4: bl ___unordsf2
+; CHECK-ARMv4-NEXT: orrs r0, r0, r6
+; CHECK-ARMv4-NEXT: mov r0, #154
; CHECK-ARMv7-LABEL: f7:
; CHECK-ARMv7: vcmp.f32
diff --git a/llvm/test/CodeGen/LoongArch/bf16-promote.ll b/llvm/test/CodeGen/LoongArch/bf16-promote.ll
new file mode 100644
index 0000000..42651eb
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bf16-promote.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64d < %s | FileCheck --check-prefixes=CHECK,LA64 %s
+; RUN: llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32d < %s | FileCheck --check-prefixes=CHECK,LA32 %s
+
+define void @test_load_store(ptr %p, ptr %q) nounwind {
+; CHECK-LABEL: test_load_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: st.h $a0, $a1, 0
+; CHECK-NEXT: ret
+ %a = load bfloat, ptr %p
+ store bfloat %a, ptr %q
+ ret void
+}
+
+define float @test_fpextend_float(ptr %p) nounwind {
+; LA64-LABEL: test_fpextend_float:
+; LA64: # %bb.0:
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ret
+;
+; LA32-LABEL: test_fpextend_float:
+; LA32: # %bb.0:
+; LA32-NEXT: ld.hu $a0, $a0, 0
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: movgr2fr.w $fa0, $a0
+; LA32-NEXT: ret
+ %a = load bfloat, ptr %p
+ %r = fpext bfloat %a to float
+ ret float %r
+}
+
+define double @test_fpextend_double(ptr %p) nounwind {
+; LA64-LABEL: test_fpextend_double:
+; LA64: # %bb.0:
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: fcvt.d.s $fa0, $fa0
+; LA64-NEXT: ret
+;
+; LA32-LABEL: test_fpextend_double:
+; LA32: # %bb.0:
+; LA32-NEXT: ld.hu $a0, $a0, 0
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: movgr2fr.w $fa0, $a0
+; LA32-NEXT: fcvt.d.s $fa0, $fa0
+; LA32-NEXT: ret
+ %a = load bfloat, ptr %p
+ %r = fpext bfloat %a to double
+ ret double %r
+}
+
+define void @test_fptrunc_float(float %f, ptr %p) nounwind {
+; LA64-LABEL: test_fptrunc_float:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: move $fp, $a0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: st.h $a0, $fp, 0
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32-LABEL: test_fptrunc_float:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: move $fp, $a0
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: movfr2gr.s $a0, $fa0
+; LA32-NEXT: st.h $a0, $fp, 0
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+ %a = fptrunc float %f to bfloat
+ store bfloat %a, ptr %p
+ ret void
+}
+
+define void @test_fptrunc_double(double %d, ptr %p) nounwind {
+; LA64-LABEL: test_fptrunc_double:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: move $fp, $a0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: st.h $a0, $fp, 0
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32-LABEL: test_fptrunc_double:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: move $fp, $a0
+; LA32-NEXT: bl __truncdfbf2
+; LA32-NEXT: movfr2gr.s $a0, $fa0
+; LA32-NEXT: st.h $a0, $fp, 0
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+ %a = fptrunc double %d to bfloat
+ store bfloat %a, ptr %p
+ ret void
+}
+
+define void @test_fadd(ptr %p, ptr %q) nounwind {
+; LA64-LABEL: test_fadd:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: ld.hu $a1, $a1, 0
+; LA64-NEXT: move $fp, $a0
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: st.h $a0, $fp, 0
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32-LABEL: test_fadd:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: ld.hu $a1, $a1, 0
+; LA32-NEXT: move $fp, $a0
+; LA32-NEXT: ld.hu $a0, $a0, 0
+; LA32-NEXT: slli.w $a1, $a1, 16
+; LA32-NEXT: movgr2fr.w $fa0, $a1
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: movgr2fr.w $fa1, $a0
+; LA32-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: movfr2gr.s $a0, $fa0
+; LA32-NEXT: st.h $a0, $fp, 0
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+ %a = load bfloat, ptr %p
+ %b = load bfloat, ptr %q
+ %r = fadd bfloat %a, %b
+ store bfloat %r, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/bf16.ll b/llvm/test/CodeGen/LoongArch/bf16.ll
new file mode 100644
index 0000000..e580bcc
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/bf16.ll
@@ -0,0 +1,1048 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;; For `double` parameters and return values, compiling on loongarch32 with `-mattr=+d` and
+;; `-target-abi=ilp32s` is an incompatible combination, resulting in the error 'Passing f64 with GPR on LA32 is undefined'.
+;; Therefore, such cases are currently skipped in testing.
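+;; For illustration, a RUN line combining those options, e.g.
+;;   llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32s -verify-machineinstrs < %s
+;; would hit that error, which is why no ilp32s+d configuration appears among the RUN lines below.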
+; RUN: llc -mtriple=loongarch32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64
+; RUN: llc -mtriple=loongarch32 -mattr=+f -target-abi=ilp32s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32F-ILP32S
+; RUN: llc -mtriple=loongarch32 -mattr=+f -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32F-ILP32D
+; RUN: llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32D-ILP32D
+; RUN: llc -mtriple=loongarch64 -mattr=+f -target-abi=lp64s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64F-LP64S
+; RUN: llc -mtriple=loongarch64 -mattr=+f -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64F-LP64D
+; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64D-LP64S
+; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64D-LP64D
+
+define bfloat @float_to_bfloat(float %a) nounwind {
+; LA32-LABEL: float_to_bfloat:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: float_to_bfloat:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: float_to_bfloat:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: float_to_bfloat:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: float_to_bfloat:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: float_to_bfloat:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: float_to_bfloat:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: float_to_bfloat:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: float_to_bfloat:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = fptrunc float %a to bfloat
+ ret bfloat %1
+}
+
+define bfloat @double_to_bfloat(double %a) nounwind {
+; LA32-LABEL: double_to_bfloat:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: bl __truncdfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: double_to_bfloat:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: double_to_bfloat:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: bl __truncdfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: double_to_bfloat:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: bl __truncdfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: double_to_bfloat:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: bl __truncdfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: double_to_bfloat:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: double_to_bfloat:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: double_to_bfloat:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: double_to_bfloat:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncdfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = fptrunc double %a to bfloat
+ ret bfloat %1
+}
+
+define float @bfloat_to_float(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_float:
+; LA32: # %bb.0:
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_float:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_float:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_float:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_float:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_float:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_float:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_float:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_float:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ret
+ %1 = fpext bfloat %a to float
+ ret float %1
+}
+
+define double @bfloat_to_double(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_double:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: bl __extendsfdf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_double:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: fcvt.d.s $fa0, $fa0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_double:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: bl __extendsfdf2
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_double:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: bl __extendsfdf2
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_double:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: fcvt.d.s $fa0, $fa0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_double:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64S-NEXT: fcvt.d.s $fa0, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.d $a0, $fa0
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_double:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: fcvt.d.s $fa0, $fa0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_double:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64S-NEXT: fcvt.d.s $fa0, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.d $a0, $fa0
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_double:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: fcvt.d.s $fa0, $fa0
+; LA64D-LP64D-NEXT: ret
+ %1 = fpext bfloat %a to double
+ ret double %1
+}
+
+define bfloat @i16_to_bfloat(i16 %a) nounwind {
+; LA32-LABEL: i16_to_bfloat:
+; LA32: # %bb.0:
+; LA32-NEXT: ret
+;
+; LA64-LABEL: i16_to_bfloat:
+; LA64: # %bb.0:
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: i16_to_bfloat:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: i16_to_bfloat:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: i16_to_bfloat:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: i16_to_bfloat:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: i16_to_bfloat:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: i16_to_bfloat:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: i16_to_bfloat:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ret
+ %1 = bitcast i16 %a to bfloat
+ ret bfloat %1
+}
+
+define i16 @bfloat_to_i16(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_i16:
+; LA32: # %bb.0:
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_i16:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_i16:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_i16:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_i16:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_i16:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_i16:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_i16:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_i16:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: ret
+ %1 = bitcast bfloat %a to i16
+ ret i16 %1
+}
+
+define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
+; LA32-LABEL: bfloat_add:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: slli.w $a1, $a1, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_add:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: movfr2gr.s $a1, $fa1
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_add:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_add:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_add:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_add:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_add:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_add:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_add:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = fadd bfloat %a, %b
+ ret bfloat %1
+}
+
+define bfloat @bfloat_load(ptr %a) nounwind {
+; LA32-LABEL: bfloat_load:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: ld.h $a1, $a0, 0
+; LA32-NEXT: ld.h $a2, $a0, 6
+; LA32-NEXT: slli.w $a0, $a1, 16
+; LA32-NEXT: slli.w $a1, $a2, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_load:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: ld.hu $a1, $a0, 6
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_load:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: ld.hu $a1, $a0, 6
+; LA32F-ILP32S-NEXT: ld.hu $a0, $a0, 0
+; LA32F-ILP32S-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_load:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: ld.hu $a1, $a0, 6
+; LA32F-ILP32D-NEXT: ld.hu $a0, $a0, 0
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_load:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: ld.hu $a1, $a0, 6
+; LA32D-ILP32D-NEXT: ld.hu $a0, $a0, 0
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_load:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: ld.hu $a1, $a0, 6
+; LA64F-LP64S-NEXT: ld.hu $a0, $a0, 0
+; LA64F-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_load:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: ld.hu $a1, $a0, 6
+; LA64F-LP64D-NEXT: ld.hu $a0, $a0, 0
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_load:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: ld.hu $a1, $a0, 6
+; LA64D-LP64S-NEXT: ld.hu $a0, $a0, 0
+; LA64D-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_load:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: ld.hu $a1, $a0, 6
+; LA64D-LP64D-NEXT: ld.hu $a0, $a0, 0
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = load bfloat, ptr %a
+ %2 = getelementptr bfloat, ptr %a, i32 3
+ %3 = load bfloat, ptr %2
+ %4 = fadd bfloat %1, %3
+ ret bfloat %4
+}
+
+define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
+; LA32-LABEL: bfloat_store:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: move $fp, $a0
+; LA32-NEXT: slli.w $a0, $a1, 16
+; LA32-NEXT: slli.w $a1, $a2, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: st.h $a0, $fp, 0
+; LA32-NEXT: st.h $a0, $fp, 16
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_store:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: move $fp, $a0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: movfr2gr.s $a1, $fa1
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: st.h $a0, $fp, 0
+; LA64-NEXT: st.h $a0, $fp, 16
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_store:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: move $fp, $a0
+; LA32F-ILP32S-NEXT: slli.w $a0, $a2, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32S-NEXT: slli.w $a0, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: st.h $a0, $fp, 0
+; LA32F-ILP32S-NEXT: st.h $a0, $fp, 16
+; LA32F-ILP32S-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_store:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: move $fp, $a0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: st.h $a0, $fp, 0
+; LA32F-ILP32D-NEXT: st.h $a0, $fp, 16
+; LA32F-ILP32D-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_store:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: move $fp, $a0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: st.h $a0, $fp, 0
+; LA32D-ILP32D-NEXT: st.h $a0, $fp, 16
+; LA32D-ILP32D-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_store:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: move $fp, $a0
+; LA64F-LP64S-NEXT: slli.d $a0, $a2, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64S-NEXT: slli.d $a0, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: st.h $a0, $fp, 0
+; LA64F-LP64S-NEXT: st.h $a0, $fp, 16
+; LA64F-LP64S-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_store:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: move $fp, $a0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: st.h $a0, $fp, 0
+; LA64F-LP64D-NEXT: st.h $a0, $fp, 16
+; LA64F-LP64D-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_store:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: move $fp, $a0
+; LA64D-LP64S-NEXT: slli.d $a0, $a2, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64S-NEXT: slli.d $a0, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: st.h $a0, $fp, 0
+; LA64D-LP64S-NEXT: st.h $a0, $fp, 16
+; LA64D-LP64S-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_store:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: move $fp, $a0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: st.h $a0, $fp, 0
+; LA64D-LP64D-NEXT: st.h $a0, $fp, 16
+; LA64D-LP64D-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+ %1 = fadd bfloat %b, %c
+ store bfloat %1, ptr %a
+ %2 = getelementptr bfloat, ptr %a, i32 8
+ store bfloat %1, ptr %2
+ ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index 95fc947..0e172950 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -4,76 +4,14 @@
define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
; CHECK-LABEL: shuffle_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -64
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset 1, -8
-; CHECK-NEXT: .cfi_offset 22, -16
-; CHECK-NEXT: addi.d $fp, $sp, 64
-; CHECK-NEXT: .cfi_def_cfa 22, 0
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: ld.h $a0, $sp, 16
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 0
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 1
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 2
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 3
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 4
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 5
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6
-; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 6
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
-; CHECK-NEXT: ld.h $a0, $sp, 18
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1
-; CHECK-NEXT: ld.h $a0, $sp, 20
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 2
-; CHECK-NEXT: ld.h $a0, $sp, 22
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 3
-; CHECK-NEXT: ld.h $a0, $sp, 24
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 4
-; CHECK-NEXT: ld.h $a0, $sp, 26
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 5
-; CHECK-NEXT: ld.h $a0, $sp, 28
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 6
-; CHECK-NEXT: ld.h $a0, $sp, 30
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 7
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: addi.d $sp, $fp, -64
-; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 64
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1)
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvshuf.h $xr1, $xr2, $xr0
+; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%shuffle = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
@@ -83,21 +21,13 @@ define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
; CHECK-LABEL: shuffle_v16i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a1, 1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 1
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a1, 2
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a1, 3
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 4
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 5
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 6
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 7
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1)
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI1_1)
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -107,13 +37,13 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
; CHECK-LABEL: shuffle_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a1, 1
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1)
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_1)
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -123,14 +53,13 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
; CHECK-LABEL: shuffle_v4i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1)
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_1)
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
@@ -140,19 +69,13 @@ define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
define <8 x float> @shuffle_v8f32(<8 x float> %a) {
; CHECK-LABEL: shuffle_v8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
-; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; CHECK-NEXT: movgr2fr.d $fa2, $a1
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a1, $fa2
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a1, 1
-; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa0, $a1
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1)
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_1)
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -162,22 +85,13 @@ define <8 x float> @shuffle_v8f32(<8 x float> %a) {
define <4 x double> @shuffle_v4f64(<4 x double> %a) {
; CHECK-LABEL: shuffle_v4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: movgr2fr.d $fa2, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa2
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
-; CHECK-NEXT: movgr2fr.d $fa2, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: movgr2fr.d $fa0, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa2
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
+; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI5_0)
+; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1)
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI5_1)
+; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 8e4c77e..bac8bbb 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -8,17 +8,16 @@
define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
; CHECK-LABEL: out_v1i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<8>;
+; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [out_v1i8_param_0];
-; CHECK-NEXT: ld.param.b8 %rs2, [out_v1i8_param_2];
-; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: ld.param.b8 %rs4, [out_v1i8_param_1];
-; CHECK-NEXT: not.b16 %rs5, %rs2;
-; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5;
-; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT: st.param.b8 [func_retval0], %rs7;
+; CHECK-NEXT: ld.param.b8 %rs2, [out_v1i8_param_1];
+; CHECK-NEXT: ld.param.b8 %rs3, [out_v1i8_param_2];
+; CHECK-NEXT: xor.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT: and.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs6;
; CHECK-NEXT: ret;
%mx = and <1 x i8> %x, %mask
%notmask = xor <1 x i8> %mask, <i8 -1>
@@ -34,17 +33,16 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
; CHECK-LABEL: out_v1i16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<8>;
+; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [out_v1i16_param_0];
-; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_2];
-; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT: ld.param.b16 %rs4, [out_v1i16_param_1];
-; CHECK-NEXT: not.b16 %rs5, %rs2;
-; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5;
-; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_1];
+; CHECK-NEXT: ld.param.b16 %rs3, [out_v1i16_param_2];
+; CHECK-NEXT: xor.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT: and.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs6;
; CHECK-NEXT: ret;
%mx = and <1 x i16> %x, %mask
%notmask = xor <1 x i16> %mask, <i16 -1>
@@ -126,17 +124,16 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
; CHECK-LABEL: out_v1i32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [out_v1i32_param_0];
-; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_2];
-; CHECK-NEXT: and.b32 %r3, %r1, %r2;
-; CHECK-NEXT: ld.param.b32 %r4, [out_v1i32_param_1];
-; CHECK-NEXT: not.b32 %r5, %r2;
-; CHECK-NEXT: and.b32 %r6, %r4, %r5;
-; CHECK-NEXT: or.b32 %r7, %r3, %r6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [out_v1i32_param_2];
+; CHECK-NEXT: xor.b32 %r4, %r1, %r2;
+; CHECK-NEXT: and.b32 %r5, %r4, %r3;
+; CHECK-NEXT: xor.b32 %r6, %r5, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
%mx = and <1 x i32> %x, %mask
%notmask = xor <1 x i32> %mask, <i32 -1>
@@ -230,21 +227,19 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
; CHECK-LABEL: out_v2i32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<15>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0];
-; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2];
-; CHECK-NEXT: and.b32 %r5, %r1, %r3;
-; CHECK-NEXT: and.b32 %r6, %r2, %r4;
-; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1];
-; CHECK-NEXT: not.b32 %r9, %r4;
-; CHECK-NEXT: not.b32 %r10, %r3;
-; CHECK-NEXT: and.b32 %r11, %r7, %r10;
-; CHECK-NEXT: and.b32 %r12, %r8, %r9;
-; CHECK-NEXT: or.b32 %r13, %r6, %r12;
-; CHECK-NEXT: or.b32 %r14, %r5, %r11;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13};
+; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1];
+; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2];
+; CHECK-NEXT: xor.b32 %r7, %r2, %r4;
+; CHECK-NEXT: and.b32 %r8, %r7, %r6;
+; CHECK-NEXT: xor.b32 %r9, %r8, %r4;
+; CHECK-NEXT: xor.b32 %r10, %r1, %r3;
+; CHECK-NEXT: and.b32 %r11, %r10, %r5;
+; CHECK-NEXT: xor.b32 %r12, %r11, %r3;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r9};
; CHECK-NEXT: ret;
%mx = and <2 x i32> %x, %mask
%notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
@@ -256,17 +251,16 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
; CHECK-LABEL: out_v1i64(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [out_v1i64_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_2];
-; CHECK-NEXT: and.b64 %rd3, %rd1, %rd2;
-; CHECK-NEXT: ld.param.b64 %rd4, [out_v1i64_param_1];
-; CHECK-NEXT: not.b64 %rd5, %rd2;
-; CHECK-NEXT: and.b64 %rd6, %rd4, %rd5;
-; CHECK-NEXT: or.b64 %rd7, %rd3, %rd6;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_1];
+; CHECK-NEXT: ld.param.b64 %rd3, [out_v1i64_param_2];
+; CHECK-NEXT: xor.b64 %rd4, %rd1, %rd2;
+; CHECK-NEXT: and.b64 %rd5, %rd4, %rd3;
+; CHECK-NEXT: xor.b64 %rd6, %rd5, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
; CHECK-NEXT: ret;
%mx = and <1 x i64> %x, %mask
%notmask = xor <1 x i64> %mask, <i64 -1>
@@ -350,29 +344,25 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
; CHECK-LABEL: out_v4i32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<29>;
+; CHECK-NEXT: .reg .b32 %r<25>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
-; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
-; CHECK-NEXT: and.b32 %r9, %r1, %r5;
-; CHECK-NEXT: and.b32 %r10, %r2, %r6;
-; CHECK-NEXT: and.b32 %r11, %r3, %r7;
-; CHECK-NEXT: and.b32 %r12, %r4, %r8;
-; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
-; CHECK-NEXT: not.b32 %r17, %r8;
-; CHECK-NEXT: not.b32 %r18, %r7;
-; CHECK-NEXT: not.b32 %r19, %r6;
-; CHECK-NEXT: not.b32 %r20, %r5;
-; CHECK-NEXT: and.b32 %r21, %r13, %r20;
-; CHECK-NEXT: and.b32 %r22, %r14, %r19;
-; CHECK-NEXT: and.b32 %r23, %r15, %r18;
-; CHECK-NEXT: and.b32 %r24, %r16, %r17;
-; CHECK-NEXT: or.b32 %r25, %r12, %r24;
-; CHECK-NEXT: or.b32 %r26, %r11, %r23;
-; CHECK-NEXT: or.b32 %r27, %r10, %r22;
-; CHECK-NEXT: or.b32 %r28, %r9, %r21;
-; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1];
+; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2];
+; CHECK-NEXT: xor.b32 %r13, %r4, %r8;
+; CHECK-NEXT: and.b32 %r14, %r13, %r12;
+; CHECK-NEXT: xor.b32 %r15, %r14, %r8;
+; CHECK-NEXT: xor.b32 %r16, %r3, %r7;
+; CHECK-NEXT: and.b32 %r17, %r16, %r11;
+; CHECK-NEXT: xor.b32 %r18, %r17, %r7;
+; CHECK-NEXT: xor.b32 %r19, %r2, %r6;
+; CHECK-NEXT: and.b32 %r20, %r19, %r10;
+; CHECK-NEXT: xor.b32 %r21, %r20, %r6;
+; CHECK-NEXT: xor.b32 %r22, %r1, %r5;
+; CHECK-NEXT: and.b32 %r23, %r22, %r9;
+; CHECK-NEXT: xor.b32 %r24, %r23, %r5;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r24, %r21, %r18, %r15};
; CHECK-NEXT: ret;
%mx = and <4 x i32> %x, %mask
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -384,26 +374,23 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin
define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
; CHECK-LABEL: out_v4i32_undef(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<26>;
+; CHECK-NEXT: .reg .b32 %r<23>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
; CHECK-NEXT: and.b32 %r9, %r3, %r7;
-; CHECK-NEXT: and.b32 %r10, %r1, %r5;
-; CHECK-NEXT: and.b32 %r11, %r2, %r6;
-; CHECK-NEXT: and.b32 %r12, %r4, %r8;
-; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
-; CHECK-NEXT: not.b32 %r17, %r8;
-; CHECK-NEXT: not.b32 %r18, %r6;
-; CHECK-NEXT: not.b32 %r19, %r5;
-; CHECK-NEXT: and.b32 %r20, %r13, %r19;
-; CHECK-NEXT: and.b32 %r21, %r14, %r18;
-; CHECK-NEXT: and.b32 %r22, %r16, %r17;
-; CHECK-NEXT: or.b32 %r23, %r12, %r22;
-; CHECK-NEXT: or.b32 %r24, %r11, %r21;
-; CHECK-NEXT: or.b32 %r25, %r10, %r20;
-; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23};
+; CHECK-NEXT: ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1];
+; CHECK-NEXT: xor.b32 %r14, %r4, %r13;
+; CHECK-NEXT: and.b32 %r15, %r14, %r8;
+; CHECK-NEXT: xor.b32 %r16, %r15, %r13;
+; CHECK-NEXT: xor.b32 %r17, %r2, %r11;
+; CHECK-NEXT: and.b32 %r18, %r17, %r6;
+; CHECK-NEXT: xor.b32 %r19, %r18, %r11;
+; CHECK-NEXT: xor.b32 %r20, %r1, %r10;
+; CHECK-NEXT: and.b32 %r21, %r20, %r5;
+; CHECK-NEXT: xor.b32 %r22, %r21, %r10;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, %r16};
; CHECK-NEXT: ret;
%mx = and <4 x i32> %x, %mask
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
@@ -415,21 +402,19 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n
define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
; CHECK-LABEL: out_v2i64(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<15>;
+; CHECK-NEXT: .reg .b64 %rd<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0];
-; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2];
-; CHECK-NEXT: and.b64 %rd5, %rd1, %rd3;
-; CHECK-NEXT: and.b64 %rd6, %rd2, %rd4;
-; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1];
-; CHECK-NEXT: not.b64 %rd9, %rd4;
-; CHECK-NEXT: not.b64 %rd10, %rd3;
-; CHECK-NEXT: and.b64 %rd11, %rd7, %rd10;
-; CHECK-NEXT: and.b64 %rd12, %rd8, %rd9;
-; CHECK-NEXT: or.b64 %rd13, %rd6, %rd12;
-; CHECK-NEXT: or.b64 %rd14, %rd5, %rd11;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd14, %rd13};
+; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2];
+; CHECK-NEXT: xor.b64 %rd7, %rd2, %rd4;
+; CHECK-NEXT: and.b64 %rd8, %rd7, %rd6;
+; CHECK-NEXT: xor.b64 %rd9, %rd8, %rd4;
+; CHECK-NEXT: xor.b64 %rd10, %rd1, %rd3;
+; CHECK-NEXT: and.b64 %rd11, %rd10, %rd5;
+; CHECK-NEXT: xor.b64 %rd12, %rd11, %rd3;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd12, %rd9};
; CHECK-NEXT: ret;
%mx = and <2 x i64> %x, %mask
%notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
index 91431ed..9dd0fbe 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -3713,30 +3713,26 @@ entry:
define <2 x i64> @spltConst1ll() {
; P9BE-LABEL: spltConst1ll:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: addis r3, r2, .LCPI65_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI65_0@toc@l
-; P9BE-NEXT: lxv v2, 0(r3)
+; P9BE-NEXT: vspltisw v2, 1
+; P9BE-NEXT: vupklsw v2, v2
; P9BE-NEXT: blr
;
; P9LE-LABEL: spltConst1ll:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: addis r3, r2, .LCPI65_0@toc@ha
-; P9LE-NEXT: addi r3, r3, .LCPI65_0@toc@l
-; P9LE-NEXT: lxv v2, 0(r3)
+; P9LE-NEXT: vspltisw v2, 1
+; P9LE-NEXT: vupklsw v2, v2
; P9LE-NEXT: blr
;
; P8BE-LABEL: spltConst1ll:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: addis r3, r2, .LCPI65_0@toc@ha
-; P8BE-NEXT: addi r3, r3, .LCPI65_0@toc@l
-; P8BE-NEXT: lxvd2x v2, 0, r3
+; P8BE-NEXT: vspltisw v2, 1
+; P8BE-NEXT: vupklsw v2, v2
; P8BE-NEXT: blr
;
; P8LE-LABEL: spltConst1ll:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: addis r3, r2, .LCPI65_0@toc@ha
-; P8LE-NEXT: addi r3, r3, .LCPI65_0@toc@l
-; P8LE-NEXT: lxvd2x v2, 0, r3
+; P8LE-NEXT: vspltisw v2, 1
+; P8LE-NEXT: vupklsw v2, v2
; P8LE-NEXT: blr
entry:
ret <2 x i64> <i64 1, i64 1>
@@ -4173,30 +4169,26 @@ entry:
define <2 x i64> @spltCnstConvftoll() {
; P9BE-LABEL: spltCnstConvftoll:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: addis r3, r2, .LCPI78_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI78_0@toc@l
-; P9BE-NEXT: lxv v2, 0(r3)
+; P9BE-NEXT: vspltisw v2, 4
+; P9BE-NEXT: vupklsw v2, v2
; P9BE-NEXT: blr
;
; P9LE-LABEL: spltCnstConvftoll:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: addis r3, r2, .LCPI78_0@toc@ha
-; P9LE-NEXT: addi r3, r3, .LCPI78_0@toc@l
-; P9LE-NEXT: lxv v2, 0(r3)
+; P9LE-NEXT: vspltisw v2, 4
+; P9LE-NEXT: vupklsw v2, v2
; P9LE-NEXT: blr
;
; P8BE-LABEL: spltCnstConvftoll:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: addis r3, r2, .LCPI78_0@toc@ha
-; P8BE-NEXT: addi r3, r3, .LCPI78_0@toc@l
-; P8BE-NEXT: lxvd2x v2, 0, r3
+; P8BE-NEXT: vspltisw v2, 4
+; P8BE-NEXT: vupklsw v2, v2
; P8BE-NEXT: blr
;
; P8LE-LABEL: spltCnstConvftoll:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: addis r3, r2, .LCPI78_0@toc@ha
-; P8LE-NEXT: addi r3, r3, .LCPI78_0@toc@l
-; P8LE-NEXT: lxvd2x v2, 0, r3
+; P8LE-NEXT: vspltisw v2, 4
+; P8LE-NEXT: vupklsw v2, v2
; P8LE-NEXT: blr
entry:
ret <2 x i64> <i64 4, i64 4>
@@ -4526,30 +4518,26 @@ entry:
define <2 x i64> @spltCnstConvdtoll() {
; P9BE-LABEL: spltCnstConvdtoll:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: addis r3, r2, .LCPI87_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI87_0@toc@l
-; P9BE-NEXT: lxv v2, 0(r3)
+; P9BE-NEXT: vspltisw v2, 4
+; P9BE-NEXT: vupklsw v2, v2
; P9BE-NEXT: blr
;
; P9LE-LABEL: spltCnstConvdtoll:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: addis r3, r2, .LCPI87_0@toc@ha
-; P9LE-NEXT: addi r3, r3, .LCPI87_0@toc@l
-; P9LE-NEXT: lxv v2, 0(r3)
+; P9LE-NEXT: vspltisw v2, 4
+; P9LE-NEXT: vupklsw v2, v2
; P9LE-NEXT: blr
;
; P8BE-LABEL: spltCnstConvdtoll:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: addis r3, r2, .LCPI87_0@toc@ha
-; P8BE-NEXT: addi r3, r3, .LCPI87_0@toc@l
-; P8BE-NEXT: lxvd2x v2, 0, r3
+; P8BE-NEXT: vspltisw v2, 4
+; P8BE-NEXT: vupklsw v2, v2
; P8BE-NEXT: blr
;
; P8LE-LABEL: spltCnstConvdtoll:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: addis r3, r2, .LCPI87_0@toc@ha
-; P8LE-NEXT: addi r3, r3, .LCPI87_0@toc@l
-; P8LE-NEXT: lxvd2x v2, 0, r3
+; P8LE-NEXT: vspltisw v2, 4
+; P8LE-NEXT: vupklsw v2, v2
; P8LE-NEXT: blr
entry:
ret <2 x i64> <i64 4, i64 4>
@@ -4879,30 +4867,26 @@ entry:
define <2 x i64> @spltConst1ull() {
; P9BE-LABEL: spltConst1ull:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: addis r3, r2, .LCPI97_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI97_0@toc@l
-; P9BE-NEXT: lxv v2, 0(r3)
+; P9BE-NEXT: vspltisw v2, 1
+; P9BE-NEXT: vupklsw v2, v2
; P9BE-NEXT: blr
;
; P9LE-LABEL: spltConst1ull:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: addis r3, r2, .LCPI97_0@toc@ha
-; P9LE-NEXT: addi r3, r3, .LCPI97_0@toc@l
-; P9LE-NEXT: lxv v2, 0(r3)
+; P9LE-NEXT: vspltisw v2, 1
+; P9LE-NEXT: vupklsw v2, v2
; P9LE-NEXT: blr
;
; P8BE-LABEL: spltConst1ull:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: addis r3, r2, .LCPI97_0@toc@ha
-; P8BE-NEXT: addi r3, r3, .LCPI97_0@toc@l
-; P8BE-NEXT: lxvd2x v2, 0, r3
+; P8BE-NEXT: vspltisw v2, 1
+; P8BE-NEXT: vupklsw v2, v2
; P8BE-NEXT: blr
;
; P8LE-LABEL: spltConst1ull:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: addis r3, r2, .LCPI97_0@toc@ha
-; P8LE-NEXT: addi r3, r3, .LCPI97_0@toc@l
-; P8LE-NEXT: lxvd2x v2, 0, r3
+; P8LE-NEXT: vspltisw v2, 1
+; P8LE-NEXT: vupklsw v2, v2
; P8LE-NEXT: blr
entry:
ret <2 x i64> <i64 1, i64 1>
@@ -5339,30 +5323,26 @@ entry:
define <2 x i64> @spltCnstConvftoull() {
; P9BE-LABEL: spltCnstConvftoull:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: addis r3, r2, .LCPI110_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI110_0@toc@l
-; P9BE-NEXT: lxv v2, 0(r3)
+; P9BE-NEXT: vspltisw v2, 4
+; P9BE-NEXT: vupklsw v2, v2
; P9BE-NEXT: blr
;
; P9LE-LABEL: spltCnstConvftoull:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: addis r3, r2, .LCPI110_0@toc@ha
-; P9LE-NEXT: addi r3, r3, .LCPI110_0@toc@l
-; P9LE-NEXT: lxv v2, 0(r3)
+; P9LE-NEXT: vspltisw v2, 4
+; P9LE-NEXT: vupklsw v2, v2
; P9LE-NEXT: blr
;
; P8BE-LABEL: spltCnstConvftoull:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: addis r3, r2, .LCPI110_0@toc@ha
-; P8BE-NEXT: addi r3, r3, .LCPI110_0@toc@l
-; P8BE-NEXT: lxvd2x v2, 0, r3
+; P8BE-NEXT: vspltisw v2, 4
+; P8BE-NEXT: vupklsw v2, v2
; P8BE-NEXT: blr
;
; P8LE-LABEL: spltCnstConvftoull:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: addis r3, r2, .LCPI110_0@toc@ha
-; P8LE-NEXT: addi r3, r3, .LCPI110_0@toc@l
-; P8LE-NEXT: lxvd2x v2, 0, r3
+; P8LE-NEXT: vspltisw v2, 4
+; P8LE-NEXT: vupklsw v2, v2
; P8LE-NEXT: blr
entry:
ret <2 x i64> <i64 4, i64 4>
@@ -5692,30 +5672,26 @@ entry:
define <2 x i64> @spltCnstConvdtoull() {
; P9BE-LABEL: spltCnstConvdtoull:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: addis r3, r2, .LCPI119_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI119_0@toc@l
-; P9BE-NEXT: lxv v2, 0(r3)
+; P9BE-NEXT: vspltisw v2, 4
+; P9BE-NEXT: vupklsw v2, v2
; P9BE-NEXT: blr
;
; P9LE-LABEL: spltCnstConvdtoull:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: addis r3, r2, .LCPI119_0@toc@ha
-; P9LE-NEXT: addi r3, r3, .LCPI119_0@toc@l
-; P9LE-NEXT: lxv v2, 0(r3)
+; P9LE-NEXT: vspltisw v2, 4
+; P9LE-NEXT: vupklsw v2, v2
; P9LE-NEXT: blr
;
; P8BE-LABEL: spltCnstConvdtoull:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: addis r3, r2, .LCPI119_0@toc@ha
-; P8BE-NEXT: addi r3, r3, .LCPI119_0@toc@l
-; P8BE-NEXT: lxvd2x v2, 0, r3
+; P8BE-NEXT: vspltisw v2, 4
+; P8BE-NEXT: vupklsw v2, v2
; P8BE-NEXT: blr
;
; P8LE-LABEL: spltCnstConvdtoull:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: addis r3, r2, .LCPI119_0@toc@ha
-; P8LE-NEXT: addi r3, r3, .LCPI119_0@toc@l
-; P8LE-NEXT: lxvd2x v2, 0, r3
+; P8LE-NEXT: vspltisw v2, 4
+; P8LE-NEXT: vupklsw v2, v2
; P8LE-NEXT: blr
entry:
ret <2 x i64> <i64 4, i64 4>
diff --git a/llvm/test/CodeGen/PowerPC/dmr-spill.ll b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
index b224643..c1b01cd 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \
+; RUN: -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-aix \
-; RUN: -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \
+; RUN: -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX
; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-aix \
-; RUN: -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \
+; RUN: -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX32
declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1>, <256 x i1>, <16 x i8>)
diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
index c2c8a42..8dd17ab 100644
--- a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll
@@ -1,11 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; This test is a copy of mma-acc-spill.ll except that it uses mcpu=future.
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -disable-auto-paired-vec-st=false \
; RUN: -mcpu=future -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
-; RUN: -disable-auto-paired-vec-st=false \
; RUN: -mcpu=future -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
diff --git a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
index e3d231a..2d67de0 100644
--- a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
@@ -271,8 +271,7 @@ define <2 x i64> @test1_v2i64(<2 x i64> %a) {
ret <2 x i64> %tmp.1
}
; CHECK-LABEL: test1_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: vupklsw v[[REG2:[0-9]+]], v{{[0-9]+}}
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v{{[0-9]+}}, v2, v[[REG2]]
@@ -282,8 +281,7 @@ define <2 x i64> @test2_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test2_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: vupklsw v[[REG2:[0-9]+]], v{{[0-9]+}}
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-NEXT: vaddudm v{{[0-9]+}}, v2, v[[REG3]]
@@ -294,8 +292,7 @@ define <2 x i64> @test3_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test3_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: vupklsw v[[REG2:[0-9]+]], v{{[0-9]+}}
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2
@@ -308,8 +305,7 @@ define <2 x i64> @test4_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test4_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: vupklsw v[[REG2:[0-9]+]], v{{[0-9]+}}
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-P8-NEXT: xxlxor v[[REG4:[0-9]+]],
@@ -322,8 +318,7 @@ define <2 x i64> @test5_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test5_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: vupklsw v[[REG2:[0-9]+]], v{{[0-9]+}}
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-NEXT: vaddudm v[[REG4:[0-9]+]], v2, v[[REG3]]
@@ -337,8 +332,7 @@ define <2 x i64> @test6_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test6_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: vupklsw v[[REG2:[0-9]+]], v{{[0-9]+}}
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-NEXT: vsubudm v{{[0-9]+}}, v2, v[[REG3]]
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
index 842cb92..1ab74e6 100644
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
@@ -105,9 +105,8 @@ define dso_local <2 x double> @testDoubleToDoubleNaNFail() local_unnamed_addr {
;
; CHECK-NOPREFIX-LABEL: testDoubleToDoubleNaNFail:
; CHECK-NOPREFIX: # %bb.0: # %entry
-; CHECK-NOPREFIX-NEXT: addis r3, r2, .LCPI2_0@toc@ha
-; CHECK-NOPREFIX-NEXT: addi r3, r3, .LCPI2_0@toc@l
-; CHECK-NOPREFIX-NEXT: lxv vs34, 0(r3)
+; CHECK-NOPREFIX-NEXT: vspltisw v2, -16
+; CHECK-NOPREFIX-NEXT: vupklsw v2, v2
; CHECK-NOPREFIX-NEXT: blr
;
; CHECK-BE-LABEL: testDoubleToDoubleNaNFail:
diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
index 4435484..6b29c78 100644
--- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -22,10 +22,10 @@ define void @test64(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9LE-NEXT: lfdx 0, 3, 4
; P9LE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
; P9LE-NEXT: xxlxor 2, 2, 2
-; P9LE-NEXT: vspltisw 4, 8
+; P9LE-NEXT: xxspltib 4, 16
; P9LE-NEXT: lxsd 3, 4(5)
; P9LE-NEXT: addi 3, 3, .LCPI0_0@toc@l
-; P9LE-NEXT: vadduwm 4, 4, 4
+; P9LE-NEXT: vextsb2w 4, 4
; P9LE-NEXT: lxv 1, 0(3)
; P9LE-NEXT: addis 3, 2, .LCPI0_1@toc@ha
; P9LE-NEXT: addi 3, 3, .LCPI0_1@toc@l
@@ -45,10 +45,10 @@ define void @test64(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-NEXT: lxsdx 2, 3, 4
; P9BE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
; P9BE-NEXT: xxlxor 1, 1, 1
-; P9BE-NEXT: vspltisw 4, 8
+; P9BE-NEXT: xxspltib 4, 16
; P9BE-NEXT: lxsd 3, 4(5)
; P9BE-NEXT: addi 3, 3, .LCPI0_0@toc@l
-; P9BE-NEXT: vadduwm 4, 4, 4
+; P9BE-NEXT: vextsb2w 4, 4
; P9BE-NEXT: lxv 0, 0(3)
; P9BE-NEXT: addis 3, 2, .LCPI0_1@toc@ha
; P9BE-NEXT: addi 3, 3, .LCPI0_1@toc@l
@@ -68,11 +68,11 @@ define void @test64(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-AIX-NEXT: lxsdx 2, 3, 4
; P9BE-AIX-NEXT: ld 3, L..C0(2) # %const.0
; P9BE-AIX-NEXT: xxlxor 1, 1, 1
-; P9BE-AIX-NEXT: vspltisw 4, 8
+; P9BE-AIX-NEXT: xxspltib 4, 16
; P9BE-AIX-NEXT: lxsd 3, 4(5)
; P9BE-AIX-NEXT: lxv 0, 0(3)
; P9BE-AIX-NEXT: ld 3, L..C1(2) # %const.1
-; P9BE-AIX-NEXT: vadduwm 4, 4, 4
+; P9BE-AIX-NEXT: vextsb2w 4, 4
; P9BE-AIX-NEXT: xxperm 2, 1, 0
; P9BE-AIX-NEXT: lxv 0, 0(3)
; P9BE-AIX-NEXT: xxperm 3, 3, 0
@@ -89,10 +89,10 @@ define void @test64(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-AIX32-NEXT: lxvwsx 0, 3, 4
; P9BE-AIX32-NEXT: li 3, 4
; P9BE-AIX32-NEXT: xxlxor 2, 2, 2
-; P9BE-AIX32-NEXT: vspltisw 4, 8
+; P9BE-AIX32-NEXT: xxspltib 4, 16
; P9BE-AIX32-NEXT: lxvwsx 1, 5, 3
; P9BE-AIX32-NEXT: lwz 3, L..C0(2) # %const.0
-; P9BE-AIX32-NEXT: vadduwm 4, 4, 4
+; P9BE-AIX32-NEXT: vextsb2w 4, 4
; P9BE-AIX32-NEXT: xxmrghw 2, 0, 1
; P9BE-AIX32-NEXT: lxv 0, 0(3)
; P9BE-AIX32-NEXT: li 3, 8
@@ -137,11 +137,11 @@ define void @test32(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9LE-NEXT: lxsiwzx 2, 3, 4
; P9LE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
; P9LE-NEXT: xxlxor 0, 0, 0
-; P9LE-NEXT: vspltisw 4, 8
+; P9LE-NEXT: xxspltib 4, 16
; P9LE-NEXT: addi 3, 3, .LCPI1_0@toc@l
; P9LE-NEXT: lxv 1, 0(3)
; P9LE-NEXT: li 3, 4
-; P9LE-NEXT: vadduwm 4, 4, 4
+; P9LE-NEXT: vextsb2w 4, 4
; P9LE-NEXT: lxsiwzx 3, 5, 3
; P9LE-NEXT: xxperm 2, 0, 1
; P9LE-NEXT: xxperm 3, 0, 1
@@ -158,11 +158,11 @@ define void @test32(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-NEXT: lxsiwzx 2, 3, 4
; P9BE-NEXT: addis 3, 2, .LCPI1_0@toc@ha
; P9BE-NEXT: xxlxor 0, 0, 0
-; P9BE-NEXT: vspltisw 4, 8
+; P9BE-NEXT: xxspltib 4, 16
; P9BE-NEXT: addi 3, 3, .LCPI1_0@toc@l
; P9BE-NEXT: lxv 1, 0(3)
; P9BE-NEXT: li 3, 4
-; P9BE-NEXT: vadduwm 4, 4, 4
+; P9BE-NEXT: vextsb2w 4, 4
; P9BE-NEXT: lxsiwzx 3, 5, 3
; P9BE-NEXT: xxperm 2, 0, 1
; P9BE-NEXT: xxperm 3, 0, 1
@@ -179,10 +179,10 @@ define void @test32(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-AIX-NEXT: lxsiwzx 2, 3, 4
; P9BE-AIX-NEXT: ld 3, L..C2(2) # %const.0
; P9BE-AIX-NEXT: xxlxor 0, 0, 0
-; P9BE-AIX-NEXT: vspltisw 4, 8
+; P9BE-AIX-NEXT: xxspltib 4, 16
; P9BE-AIX-NEXT: lxv 1, 0(3)
; P9BE-AIX-NEXT: li 3, 4
-; P9BE-AIX-NEXT: vadduwm 4, 4, 4
+; P9BE-AIX-NEXT: vextsb2w 4, 4
; P9BE-AIX-NEXT: lxsiwzx 3, 5, 3
; P9BE-AIX-NEXT: xxperm 2, 0, 1
; P9BE-AIX-NEXT: xxperm 3, 0, 1
@@ -199,10 +199,10 @@ define void @test32(ptr nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-AIX32-NEXT: lxsiwzx 2, 3, 4
; P9BE-AIX32-NEXT: lwz 3, L..C2(2) # %const.0
; P9BE-AIX32-NEXT: xxlxor 0, 0, 0
-; P9BE-AIX32-NEXT: vspltisw 4, 8
+; P9BE-AIX32-NEXT: xxspltib 4, 16
; P9BE-AIX32-NEXT: lxv 1, 0(3)
; P9BE-AIX32-NEXT: li 3, 4
-; P9BE-AIX32-NEXT: vadduwm 4, 4, 4
+; P9BE-AIX32-NEXT: vextsb2w 4, 4
; P9BE-AIX32-NEXT: lxsiwzx 3, 5, 3
; P9BE-AIX32-NEXT: xxperm 2, 0, 1
; P9BE-AIX32-NEXT: xxperm 3, 0, 1
diff --git a/llvm/test/CodeGen/PowerPC/splat-extend.ll b/llvm/test/CodeGen/PowerPC/splat-extend.ll
new file mode 100644
index 0000000..5621f52
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/splat-extend.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-aix-xcoff \
+; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-aix-xcoff \
+; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s
+
+define dso_local noundef <8 x i16> @v103s() local_unnamed_addr {
+; CHECK-LABEL: v103s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxspltib v2, 103
+; CHECK-NEXT: vupklsb v2, v2
+; CHECK-NEXT: blr
+entry:
+ ret <8 x i16> splat (i16 103)
+}
+
+define dso_local noundef <2 x i64> @v103l() local_unnamed_addr {
+; CHECK-LABEL: v103l:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxspltib v2, 103
+; CHECK-NEXT: vextsb2d v2, v2
+; CHECK-NEXT: blr
+entry:
+ ret <2 x i64> splat (i64 103)
+}
+
+define dso_local noundef <4 x i32> @v103i() local_unnamed_addr {
+; CHECK-LABEL: v103i:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxspltib v2, 103
+; CHECK-NEXT: vextsb2w v2, v2
+; CHECK-NEXT: blr
+entry:
+ ret <4 x i32> splat (i32 103)
+}
+
+define dso_local noundef <2 x i64> @v11l() local_unnamed_addr {
+; CHECK-LABEL: v11l:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltisw v2, -11
+; CHECK-NEXT: vupklsw v2, v2
+; CHECK-NEXT: blr
+entry:
+ ret <2 x i64> splat (i64 -11)
+}
diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
index 3f7e0b6..210aee1 100644
--- a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
@@ -16,9 +16,8 @@ define <2 x i64> @test_add(<2 x i64> %x, <2 x i64> %y) nounwind {
define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind {
; VSX-LABEL: increment_by_one:
; VSX: # %bb.0:
-; VSX-NEXT: addis 3, 2, .LCPI1_0@toc@ha
-; VSX-NEXT: addi 3, 3, .LCPI1_0@toc@l
-; VSX-NEXT: lxvd2x 35, 0, 3
+; VSX-NEXT: vspltisw 3, 1
+; VSX-NEXT: vupklsw 3, 3
; VSX-NEXT: vaddudm 2, 2, 3
; VSX-NEXT: blr
;
diff --git a/llvm/test/CodeGen/PowerPC/vector-extend-sign.ll b/llvm/test/CodeGen/PowerPC/vector-extend-sign.ll
index 540a00f..ef3988e 100644
--- a/llvm/test/CodeGen/PowerPC/vector-extend-sign.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-extend-sign.ll
@@ -144,9 +144,8 @@ entry:
define <2 x i64> @test_none(<2 x i64> %m) {
; CHECK-P9-LABEL: test_none:
; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: addis 3, 2, .LCPI5_0@toc@ha
-; CHECK-P9-NEXT: addi 3, 3, .LCPI5_0@toc@l
-; CHECK-P9-NEXT: lxv 35, 0(3)
+; CHECK-P9-NEXT: xxspltib 35, 16
+; CHECK-P9-NEXT: vextsb2d 3, 3
; CHECK-P9-NEXT: vsld 2, 2, 3
; CHECK-P9-NEXT: vsrad 2, 2, 3
; CHECK-P9-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
index 0435134..43cbc62 100644
--- a/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll
@@ -7838,9 +7838,9 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_16_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 8
+; PWR9-NEXT: xxspltib 35, 16
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -7974,9 +7974,9 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_16_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 8
+; PWR9-NEXT: xxspltib 35, 16
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8108,10 +8108,9 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_17_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 1
+; PWR9-NEXT: xxspltib 35, 17
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8243,10 +8242,9 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_17_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 1
+; PWR9-NEXT: xxspltib 35, 17
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8380,9 +8378,9 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_18_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 9
+; PWR9-NEXT: xxspltib 35, 18
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8516,9 +8514,9 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_18_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 9
+; PWR9-NEXT: xxspltib 35, 18
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8653,10 +8651,9 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_19_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 3
+; PWR9-NEXT: xxspltib 35, 19
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8791,10 +8788,9 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_19_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 3
+; PWR9-NEXT: xxspltib 35, 19
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -8928,9 +8924,9 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_20_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 10
+; PWR9-NEXT: xxspltib 35, 20
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9064,9 +9060,9 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_20_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 10
+; PWR9-NEXT: xxspltib 35, 20
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9201,10 +9197,9 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_21_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 5
+; PWR9-NEXT: xxspltib 35, 21
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9339,10 +9334,9 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_21_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 5
+; PWR9-NEXT: xxspltib 35, 21
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9476,9 +9470,9 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_22_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 11
+; PWR9-NEXT: xxspltib 35, 22
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9612,9 +9606,9 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_22_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 11
+; PWR9-NEXT: xxspltib 35, 22
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9749,10 +9743,9 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_23_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 7
+; PWR9-NEXT: xxspltib 35, 23
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -9887,10 +9880,9 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_23_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 7
+; PWR9-NEXT: xxspltib 35, 23
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10018,9 +10010,9 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_24_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 12
+; PWR9-NEXT: xxspltib 35, 24
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10148,9 +10140,9 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_24_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 12
+; PWR9-NEXT: xxspltib 35, 24
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10285,10 +10277,9 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_25_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 9
+; PWR9-NEXT: xxspltib 35, 25
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10423,10 +10414,9 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_25_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 9
+; PWR9-NEXT: xxspltib 35, 25
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10560,9 +10550,9 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_26_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 13
+; PWR9-NEXT: xxspltib 35, 26
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10696,9 +10686,9 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_26_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 13
+; PWR9-NEXT: xxspltib 35, 26
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10833,10 +10823,9 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_27_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 11
+; PWR9-NEXT: xxspltib 35, 27
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -10971,10 +10960,9 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_27_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 11
+; PWR9-NEXT: xxspltib 35, 27
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11108,9 +11096,9 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_28_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 14
+; PWR9-NEXT: xxspltib 35, 28
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11244,9 +11232,9 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_28_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 14
+; PWR9-NEXT: xxspltib 35, 28
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11381,10 +11369,9 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_29_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 13
+; PWR9-NEXT: xxspltib 35, 29
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11519,10 +11506,9 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_29_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 13
+; PWR9-NEXT: xxspltib 35, 29
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11656,9 +11642,9 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_30_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 15
+; PWR9-NEXT: xxspltib 35, 30
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11792,9 +11778,9 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ugt_30_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, 15
+; PWR9-NEXT: xxspltib 35, 30
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vadduwm 3, 3, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11929,10 +11915,9 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) {
;
; PWR9-LABEL: ult_31_v4i32:
; PWR9: # %bb.0:
-; PWR9-NEXT: vspltisw 3, -16
-; PWR9-NEXT: vspltisw 4, 15
+; PWR9-NEXT: xxspltib 35, 31
; PWR9-NEXT: vpopcntw 2, 2
-; PWR9-NEXT: vsubuwm 3, 4, 3
+; PWR9-NEXT: vextsb2w 3, 3
; PWR9-NEXT: vcmpgtuw 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -11991,19 +11976,17 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_1_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI100_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 1
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI100_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_1_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI100_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 1
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI100_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12061,19 +12044,17 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_2_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI101_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 2
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI101_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_2_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI101_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 2
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI101_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12195,19 +12176,17 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_2_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI102_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 2
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI102_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_2_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI102_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 2
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI102_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12329,19 +12308,17 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_3_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI103_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 3
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI103_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_3_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI103_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 3
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI103_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12463,19 +12440,17 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_3_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI104_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 3
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI104_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_3_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI104_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 3
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI104_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12597,19 +12572,17 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_4_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI105_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 4
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI105_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_4_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI105_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 4
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI105_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12731,19 +12704,17 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_4_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI106_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 4
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI106_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_4_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI106_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 4
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI106_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12865,19 +12836,17 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_5_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI107_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 5
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI107_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_5_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI107_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 5
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI107_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -12999,19 +12968,17 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_5_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI108_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 5
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI108_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_5_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI108_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 5
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI108_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13133,19 +13100,17 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_6_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI109_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 6
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI109_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_6_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI109_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 6
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI109_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13267,19 +13232,17 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_6_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI110_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 6
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI110_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_6_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI110_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 6
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI110_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13401,19 +13364,17 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_7_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI111_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 7
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI111_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_7_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI111_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 7
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI111_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13535,19 +13496,17 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_7_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI112_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 7
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI112_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_7_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI112_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 7
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI112_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13669,19 +13628,17 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_8_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI113_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 8
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI113_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_8_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI113_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 8
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI113_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13803,19 +13760,17 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_8_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI114_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 8
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI114_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_8_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI114_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 8
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI114_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -13937,19 +13892,17 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_9_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI115_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 9
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI115_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_9_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI115_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 9
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI115_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14071,19 +14024,17 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_9_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI116_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 9
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI116_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_9_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI116_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 9
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI116_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14205,19 +14156,17 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_10_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI117_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 10
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI117_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_10_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI117_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 10
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI117_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14339,19 +14288,17 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_10_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI118_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 10
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI118_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_10_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI118_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 10
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI118_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14473,19 +14420,17 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_11_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI119_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 11
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI119_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_11_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI119_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 11
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI119_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14607,19 +14552,17 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_11_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI120_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 11
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI120_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_11_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI120_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 11
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI120_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14741,19 +14684,17 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_12_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI121_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 12
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI121_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_12_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI121_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 12
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI121_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -14875,19 +14816,17 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_12_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI122_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 12
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI122_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_12_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI122_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 12
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI122_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15009,19 +14948,17 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_13_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI123_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 13
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI123_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_13_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI123_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 13
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI123_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15143,19 +15080,17 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_13_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI124_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 13
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI124_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_13_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI124_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 13
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI124_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15277,19 +15212,17 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_14_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI125_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 14
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI125_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_14_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI125_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 14
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI125_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15411,19 +15344,17 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_14_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI126_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 14
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI126_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_14_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI126_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 14
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI126_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15545,19 +15476,17 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ult_15_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI127_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 15
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI127_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 3, 2
; PWR8-NEXT: blr
;
; PWR9-LABEL: ult_15_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI127_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 15
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI127_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15679,19 +15608,17 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) {
;
; PWR8-LABEL: ugt_15_v2i64:
; PWR8: # %bb.0:
-; PWR8-NEXT: addis 3, 2, .LCPI128_0@toc@ha
+; PWR8-NEXT: vspltisw 3, 15
; PWR8-NEXT: vpopcntd 2, 2
-; PWR8-NEXT: addi 3, 3, .LCPI128_0@toc@l
-; PWR8-NEXT: lxvd2x 35, 0, 3
+; PWR8-NEXT: vupklsw 3, 3
; PWR8-NEXT: vcmpgtud 2, 2, 3
; PWR8-NEXT: blr
;
; PWR9-LABEL: ugt_15_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI128_0@toc@ha
+; PWR9-NEXT: vspltisw 3, 15
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI128_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vupklsw 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15822,10 +15749,9 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_16_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI129_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 16
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI129_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -15956,10 +15882,9 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_16_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI130_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 16
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI130_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16090,10 +16015,9 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_17_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI131_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 17
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI131_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16224,10 +16148,9 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_17_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI132_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 17
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI132_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16358,10 +16281,9 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_18_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI133_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 18
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI133_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16492,10 +16414,9 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_18_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI134_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 18
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI134_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16626,10 +16547,9 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_19_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI135_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 19
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI135_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16760,10 +16680,9 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_19_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI136_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 19
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI136_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -16894,10 +16813,9 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_20_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI137_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 20
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI137_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17028,10 +16946,9 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_20_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI138_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 20
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI138_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17162,10 +17079,9 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_21_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI139_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 21
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI139_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17296,10 +17212,9 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_21_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI140_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 21
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI140_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17430,10 +17345,9 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_22_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI141_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 22
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI141_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17564,10 +17478,9 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_22_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI142_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 22
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI142_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17698,10 +17611,9 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_23_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI143_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 23
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI143_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17832,10 +17744,9 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_23_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI144_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 23
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI144_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -17966,10 +17877,9 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_24_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI145_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 24
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI145_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18100,10 +18010,9 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_24_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI146_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 24
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI146_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18234,10 +18143,9 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_25_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI147_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 25
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI147_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18368,10 +18276,9 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_25_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI148_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 25
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI148_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18502,10 +18409,9 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_26_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI149_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 26
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI149_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18636,10 +18542,9 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_26_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI150_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 26
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI150_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18770,10 +18675,9 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_27_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI151_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 27
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI151_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -18904,10 +18808,9 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_27_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI152_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 27
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI152_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19038,10 +18941,9 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_28_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI153_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 28
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI153_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19172,10 +19074,9 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_28_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI154_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 28
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI154_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19306,10 +19207,9 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_29_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI155_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 29
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI155_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19440,10 +19340,9 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_29_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI156_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 29
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI156_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19574,10 +19473,9 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_30_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI157_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 30
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI157_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19708,10 +19606,9 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_30_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI158_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 30
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI158_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19842,10 +19739,9 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_31_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI159_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 31
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI159_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -19976,10 +19872,9 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_31_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI160_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 31
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI160_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20110,10 +20005,9 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_32_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI161_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 32
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI161_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20244,10 +20138,9 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_32_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI162_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 32
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI162_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20378,10 +20271,9 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_33_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI163_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 33
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI163_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20512,10 +20404,9 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_33_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI164_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 33
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI164_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20646,10 +20537,9 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_34_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI165_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 34
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI165_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20780,10 +20670,9 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_34_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI166_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 34
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI166_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -20914,10 +20803,9 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_35_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI167_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 35
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI167_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21048,10 +20936,9 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_35_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI168_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 35
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI168_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21182,10 +21069,9 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_36_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI169_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 36
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI169_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21316,10 +21202,9 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_36_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI170_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 36
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI170_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21450,10 +21335,9 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_37_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI171_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 37
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI171_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21584,10 +21468,9 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_37_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI172_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 37
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI172_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21718,10 +21601,9 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_38_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI173_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 38
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI173_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21852,10 +21734,9 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_38_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI174_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 38
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI174_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -21986,10 +21867,9 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_39_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI175_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 39
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI175_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22120,10 +22000,9 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_39_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI176_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 39
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI176_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22254,10 +22133,9 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_40_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI177_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 40
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI177_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22388,10 +22266,9 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_40_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI178_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 40
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI178_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22522,10 +22399,9 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_41_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI179_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 41
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI179_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22656,10 +22532,9 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_41_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI180_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 41
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI180_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22790,10 +22665,9 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_42_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI181_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 42
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI181_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -22924,10 +22798,9 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_42_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI182_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 42
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI182_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23058,10 +22931,9 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_43_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI183_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 43
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI183_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23192,10 +23064,9 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_43_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI184_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 43
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI184_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23326,10 +23197,9 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_44_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI185_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 44
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI185_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23460,10 +23330,9 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_44_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI186_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 44
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI186_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23594,10 +23463,9 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_45_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI187_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 45
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI187_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23728,10 +23596,9 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_45_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI188_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 45
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI188_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23862,10 +23729,9 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_46_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI189_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 46
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI189_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -23996,10 +23862,9 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_46_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI190_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 46
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI190_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24130,10 +23995,9 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_47_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI191_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 47
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI191_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24264,10 +24128,9 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_47_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI192_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 47
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI192_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24398,10 +24261,9 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_48_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI193_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 48
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI193_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24532,10 +24394,9 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_48_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI194_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 48
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI194_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24666,10 +24527,9 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_49_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI195_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 49
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI195_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24800,10 +24660,9 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_49_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI196_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 49
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI196_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -24934,10 +24793,9 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_50_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI197_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 50
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI197_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25068,10 +24926,9 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_50_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI198_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 50
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI198_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25202,10 +25059,9 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_51_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI199_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 51
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI199_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25336,10 +25192,9 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_51_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI200_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 51
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI200_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25470,10 +25325,9 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_52_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI201_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 52
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI201_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25604,10 +25458,9 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_52_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI202_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 52
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI202_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25738,10 +25591,9 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_53_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI203_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 53
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI203_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -25872,10 +25724,9 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_53_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI204_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 53
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI204_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26006,10 +25857,9 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_54_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI205_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 54
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI205_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26140,10 +25990,9 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_54_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI206_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 54
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI206_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26274,10 +26123,9 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_55_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI207_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 55
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI207_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26408,10 +26256,9 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_55_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI208_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 55
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI208_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26542,10 +26389,9 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_56_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI209_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 56
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI209_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26676,10 +26522,9 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_56_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI210_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 56
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI210_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26810,10 +26655,9 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_57_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI211_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 57
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI211_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -26944,10 +26788,9 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_57_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI212_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 57
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI212_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27078,10 +26921,9 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_58_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI213_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 58
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI213_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27212,10 +27054,9 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_58_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI214_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 58
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI214_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27346,10 +27187,9 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_59_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI215_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 59
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI215_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27480,10 +27320,9 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_59_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI216_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 59
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI216_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27614,10 +27453,9 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_60_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI217_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 60
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI217_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27748,10 +27586,9 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_60_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI218_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 60
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI218_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -27882,10 +27719,9 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_61_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI219_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 61
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI219_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -28016,10 +27852,9 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_61_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI220_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 61
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI220_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -28150,10 +27985,9 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_62_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI221_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 62
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI221_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -28284,10 +28118,9 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ugt_62_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI222_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 62
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI222_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 2, 3
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -28418,10 +28251,9 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) {
;
; PWR9-LABEL: ult_63_v2i64:
; PWR9: # %bb.0:
-; PWR9-NEXT: addis 3, 2, .LCPI223_0@toc@ha
+; PWR9-NEXT: xxspltib 35, 63
; PWR9-NEXT: vpopcntd 2, 2
-; PWR9-NEXT: addi 3, 3, .LCPI223_0@toc@l
-; PWR9-NEXT: lxv 35, 0(3)
+; PWR9-NEXT: vextsb2d 3, 3
; PWR9-NEXT: vcmpgtud 2, 3, 2
; PWR9-NEXT: blr
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
diff --git a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
index 949668f..7c5332f 100644
--- a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
@@ -471,15 +471,15 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
; RV32I-NEXT: mv s1, a2
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: mv s3, a0
-; RV32I-NEXT: call __eqdf2
-; RV32I-NEXT: seqz s4, a0
+; RV32I-NEXT: call __unorddf2
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: mv a2, s1
; RV32I-NEXT: mv a3, s0
-; RV32I-NEXT: call __unorddf2
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: or a0, a0, s4
+; RV32I-NEXT: call __eqdf2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, s4, a0
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -498,13 +498,13 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp {
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: call __eqdf2
-; RV64I-NEXT: seqz s2, a0
+; RV64I-NEXT: call __unorddf2
+; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call __unorddf2
-; RV64I-NEXT: snez a0, a0
-; RV64I-NEXT: or a0, a0, s2
+; RV64I-NEXT: call __eqdf2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, s2, a0
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -1199,15 +1199,15 @@ define i32 @fcmps_ueq(double %a, double %b) nounwind strictfp {
; RV32I-NEXT: mv s1, a2
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: mv s3, a0
-; RV32I-NEXT: call __eqdf2
-; RV32I-NEXT: seqz s4, a0
+; RV32I-NEXT: call __unorddf2
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: mv a2, s1
; RV32I-NEXT: mv a3, s0
-; RV32I-NEXT: call __unorddf2
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: or a0, a0, s4
+; RV32I-NEXT: call __eqdf2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, s4, a0
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1226,13 +1226,13 @@ define i32 @fcmps_ueq(double %a, double %b) nounwind strictfp {
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: call __eqdf2
-; RV64I-NEXT: seqz s2, a0
+; RV64I-NEXT: call __unorddf2
+; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call __unorddf2
-; RV64I-NEXT: snez a0, a0
-; RV64I-NEXT: or a0, a0, s2
+; RV64I-NEXT: call __eqdf2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, s2, a0
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/double-fcmp.ll b/llvm/test/CodeGen/RISCV/double-fcmp.ll
index 1e609f8..f73e686 100644
--- a/llvm/test/CodeGen/RISCV/double-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/double-fcmp.ll
@@ -403,15 +403,15 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind {
; RV32I-NEXT: mv s1, a2
; RV32I-NEXT: mv s2, a1
; RV32I-NEXT: mv s3, a0
-; RV32I-NEXT: call __eqdf2
-; RV32I-NEXT: seqz s4, a0
+; RV32I-NEXT: call __unorddf2
+; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: mv a2, s1
; RV32I-NEXT: mv a3, s0
-; RV32I-NEXT: call __unorddf2
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: or a0, a0, s4
+; RV32I-NEXT: call __eqdf2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, s4, a0
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -430,13 +430,13 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind {
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: call __eqdf2
-; RV64I-NEXT: seqz s2, a0
+; RV64I-NEXT: call __unorddf2
+; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call __unorddf2
-; RV64I-NEXT: snez a0, a0
-; RV64I-NEXT: or a0, a0, s2
+; RV64I-NEXT: call __eqdf2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, s2, a0
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
index 0cbfc96..fd3baa0 100644
--- a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
@@ -382,13 +382,13 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp {
; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: call __eqsf2
-; RV32I-NEXT: seqz s2, a0
+; RV32I-NEXT: call __unordsf2
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call __unordsf2
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: or a0, a0, s2
+; RV32I-NEXT: call __eqsf2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, s2, a0
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -405,13 +405,13 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp {
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: call __eqsf2
-; RV64I-NEXT: seqz s2, a0
+; RV64I-NEXT: call __unordsf2
+; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call __unordsf2
-; RV64I-NEXT: snez a0, a0
-; RV64I-NEXT: or a0, a0, s2
+; RV64I-NEXT: call __eqsf2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, s2, a0
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -991,13 +991,13 @@ define i32 @fcmps_ueq(float %a, float %b) nounwind strictfp {
; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: call __eqsf2
-; RV32I-NEXT: seqz s2, a0
+; RV32I-NEXT: call __unordsf2
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call __unordsf2
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: or a0, a0, s2
+; RV32I-NEXT: call __eqsf2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, s2, a0
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1014,13 +1014,13 @@ define i32 @fcmps_ueq(float %a, float %b) nounwind strictfp {
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: call __eqsf2
-; RV64I-NEXT: seqz s2, a0
+; RV64I-NEXT: call __unordsf2
+; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call __unordsf2
-; RV64I-NEXT: snez a0, a0
-; RV64I-NEXT: or a0, a0, s2
+; RV64I-NEXT: call __eqsf2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, s2, a0
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-fcmp.ll b/llvm/test/CodeGen/RISCV/float-fcmp.ll
index 265d553..2e9c39f 100644
--- a/llvm/test/CodeGen/RISCV/float-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/float-fcmp.ll
@@ -344,13 +344,13 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind {
; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: call __eqsf2
-; RV32I-NEXT: seqz s2, a0
+; RV32I-NEXT: call __unordsf2
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call __unordsf2
-; RV32I-NEXT: snez a0, a0
-; RV32I-NEXT: or a0, a0, s2
+; RV32I-NEXT: call __eqsf2
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: or a0, s2, a0
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -367,13 +367,13 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind {
; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a1
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: call __eqsf2
-; RV64I-NEXT: seqz s2, a0
+; RV64I-NEXT: call __unordsf2
+; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: mv a0, s1
; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call __unordsf2
-; RV64I-NEXT: snez a0, a0
-; RV64I-NEXT: or a0, a0, s2
+; RV64I-NEXT: call __eqsf2
+; RV64I-NEXT: seqz a0, a0
+; RV64I-NEXT: or a0, s2, a0
; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
new file mode 100644
index 0000000..631b710
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I
+; RUN: llc -mtriple=riscv64 < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
+;
+; Test that masked-merge code is generated as an "xor; and; xor" sequence,
+; or as "andn; and; or" when an and-not instruction is available.
+
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: masked_merge0:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: xor a1, a1, a2
+; CHECK-I-NEXT: and a0, a1, a0
+; CHECK-I-NEXT: xor a0, a0, a2
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: masked_merge0:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a1, a0, a1
+; CHECK-ZBB-NEXT: andn a0, a2, a0
+; CHECK-ZBB-NEXT: or a0, a1, a0
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; CHECK-I-LABEL: masked_merge1:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: xor a1, a1, a2
+; CHECK-I-NEXT: and a0, a1, a0
+; CHECK-I-NEXT: xor a0, a0, a2
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: masked_merge1:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a1, a0, a1
+; CHECK-ZBB-NEXT: andn a0, a2, a0
+; CHECK-ZBB-NEXT: or a0, a1, a0
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i16 %a0, %a1
+ %not = xor i16 %a0, -1
+ %and1 = and i16 %a2, %not
+ %or = or i16 %and0, %and1
+ ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; CHECK-I-LABEL: masked_merge2:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: mv a0, a1
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: masked_merge2:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: andn a2, a1, a0
+; CHECK-ZBB-NEXT: and a0, a1, a0
+; CHECK-ZBB-NEXT: or a0, a2, a0
+; CHECK-ZBB-NEXT: ret
+ %not = xor i8 %a0, -1
+ %and0 = and i8 %not, %a1
+ %and1 = and i8 %a1, %a0
+ %or = or i8 %and0, %and1
+ ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; RV32I-LABEL: masked_merge3:
+; RV32I: # %bb.0:
+; RV32I-NEXT: not a5, a5
+; RV32I-NEXT: not a4, a4
+; RV32I-NEXT: xor a3, a3, a5
+; RV32I-NEXT: xor a2, a2, a4
+; RV32I-NEXT: not a2, a2
+; RV32I-NEXT: not a3, a3
+; RV32I-NEXT: and a0, a2, a0
+; RV32I-NEXT: and a1, a3, a1
+; RV32I-NEXT: xor a0, a0, a4
+; RV32I-NEXT: xor a1, a1, a5
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: masked_merge3:
+; RV64I: # %bb.0:
+; RV64I-NEXT: not a2, a2
+; RV64I-NEXT: xor a1, a1, a2
+; RV64I-NEXT: not a1, a1
+; RV64I-NEXT: and a0, a1, a0
+; RV64I-NEXT: xor a0, a0, a2
+; RV64I-NEXT: ret
+;
+; RV32ZBB-LABEL: masked_merge3:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: not a6, a0
+; RV32ZBB-NEXT: not a7, a1
+; RV32ZBB-NEXT: andn a1, a1, a3
+; RV32ZBB-NEXT: andn a0, a0, a2
+; RV32ZBB-NEXT: andn a2, a7, a5
+; RV32ZBB-NEXT: andn a3, a6, a4
+; RV32ZBB-NEXT: or a0, a3, a0
+; RV32ZBB-NEXT: or a1, a2, a1
+; RV32ZBB-NEXT: ret
+;
+; RV64ZBB-LABEL: masked_merge3:
+; RV64ZBB: # %bb.0:
+; RV64ZBB-NEXT: not a3, a0
+; RV64ZBB-NEXT: andn a2, a3, a2
+; RV64ZBB-NEXT: andn a0, a0, a1
+; RV64ZBB-NEXT: or a0, a2, a0
+; RV64ZBB-NEXT: ret
+ %v0 = xor i64 %a1, -1
+ %v1 = xor i64 %a2, -1
+ %not = xor i64 %a0, -1
+ %and0 = and i64 %not, %v1
+ %and1 = and i64 %v0, %a0
+ %or = or i64 %and0, %and1
+ ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; RV32-LABEL: not_a_masked_merge0:
+; RV32: # %bb.0:
+; RV32-NEXT: and a1, a0, a1
+; RV32-NEXT: neg a0, a0
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_a_masked_merge0:
+; RV64: # %bb.0:
+; RV64-NEXT: and a1, a0, a1
+; RV64-NEXT: negw a0, a0
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: or a0, a1, a0
+; RV64-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not_a_not = sub i32 0, %a0
+ %and1 = and i32 %not_a_not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-I-LABEL: not_a_masked_merge1:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: and a0, a0, a1
+; CHECK-I-NEXT: not a1, a3
+; CHECK-I-NEXT: and a1, a1, a2
+; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge1:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a0, a0, a1
+; CHECK-ZBB-NEXT: andn a1, a2, a3
+; CHECK-ZBB-NEXT: or a0, a0, a1
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a3, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge2:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: or a1, a0, a1
+; CHECK-I-NEXT: not a0, a0
+; CHECK-I-NEXT: and a0, a0, a2
+; CHECK-I-NEXT: or a0, a1, a0
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge2:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: or a1, a0, a1
+; CHECK-ZBB-NEXT: andn a0, a2, a0
+; CHECK-ZBB-NEXT: or a0, a1, a0
+; CHECK-ZBB-NEXT: ret
+ %not_an_and0 = or i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %not_an_and0, %and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge3:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: and a1, a0, a1
+; CHECK-I-NEXT: xor a0, a0, a2
+; CHECK-I-NEXT: not a0, a0
+; CHECK-I-NEXT: or a0, a1, a0
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge3:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a1, a0, a1
+; CHECK-ZBB-NEXT: xor a0, a0, a2
+; CHECK-ZBB-NEXT: orn a0, a1, a0
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %not_an_and1 = xor i32 %not, %a2
+ %or = or i32 %and0, %not_an_and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: not_a_masked_merge4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a2, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform0:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: and a1, a0, a1
+; CHECK-I-NEXT: not a0, a0
+; CHECK-I-NEXT: and a0, a0, a2
+; CHECK-I-NEXT: or a0, a1, a0
+; CHECK-I-NEXT: sw a1, 0(a3)
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform0:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a1, a0, a1
+; CHECK-ZBB-NEXT: andn a0, a2, a0
+; CHECK-ZBB-NEXT: or a0, a1, a0
+; CHECK-ZBB-NEXT: sw a1, 0(a3)
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ store i32 %and0, ptr %p1
+ ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform1:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: and a1, a0, a1
+; CHECK-I-NEXT: not a4, a0
+; CHECK-I-NEXT: and a0, a4, a2
+; CHECK-I-NEXT: or a0, a1, a0
+; CHECK-I-NEXT: sw a4, 0(a3)
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform1:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a1, a0, a1
+; CHECK-ZBB-NEXT: not a4, a0
+; CHECK-ZBB-NEXT: andn a0, a2, a0
+; CHECK-ZBB-NEXT: or a0, a1, a0
+; CHECK-ZBB-NEXT: sw a4, 0(a3)
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ store i32 %not, ptr %p1
+ ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform2:
+; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: and a1, a0, a1
+; CHECK-I-NEXT: not a0, a0
+; CHECK-I-NEXT: and a2, a0, a2
+; CHECK-I-NEXT: or a0, a1, a2
+; CHECK-I-NEXT: sw a2, 0(a3)
+; CHECK-I-NEXT: ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform2:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: and a1, a0, a1
+; CHECK-ZBB-NEXT: andn a2, a2, a0
+; CHECK-ZBB-NEXT: or a0, a1, a2
+; CHECK-ZBB-NEXT: sw a2, 0(a3)
+; CHECK-ZBB-NEXT: ret
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ store i32 %and1, ptr %p1
+ ret i32 %or
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
index 7ebbe3f..a189711 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll
@@ -232,13 +232,13 @@ define i32 @reduce_of_sext(<16 x i8> %a) {
;
; DOT-LABEL: reduce_of_sext:
; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; DOT-NEXT: vmv.v.i v9, 1
; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; DOT-NEXT: vmv.v.i v9, 0
-; DOT-NEXT: lui a0, 4112
-; DOT-NEXT: addi a0, a0, 257
-; DOT-NEXT: vqdot.vx v9, v8, a0
+; DOT-NEXT: vmv.v.i v10, 0
+; DOT-NEXT: vqdot.vv v10, v8, v9
; DOT-NEXT: vmv.s.x v8, zero
-; DOT-NEXT: vredsum.vs v8, v9, v8
+; DOT-NEXT: vredsum.vs v8, v10, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
@@ -259,13 +259,13 @@ define i32 @reduce_of_zext(<16 x i8> %a) {
;
; DOT-LABEL: reduce_of_zext:
; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; DOT-NEXT: vmv.v.i v9, 1
; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; DOT-NEXT: vmv.v.i v9, 0
-; DOT-NEXT: lui a0, 4112
-; DOT-NEXT: addi a0, a0, 257
-; DOT-NEXT: vqdotu.vx v9, v8, a0
+; DOT-NEXT: vmv.v.i v10, 0
+; DOT-NEXT: vqdotu.vv v10, v8, v9
; DOT-NEXT: vmv.s.x v8, zero
-; DOT-NEXT: vredsum.vs v8, v9, v8
+; DOT-NEXT: vredsum.vs v8, v10, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
@@ -646,23 +646,31 @@ entry:
}
define <1 x i32> @vqdotsu_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) {
-; CHECK-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vsext.vf2 v10, v8
-; CHECK-NEXT: vzext.vf2 v8, v9
-; CHECK-NEXT: vwmulsu.vv v9, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v9, 3
-; CHECK-NEXT: vslidedown.vi v10, v9, 2
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v9, 1
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v9, v9, v10
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: ret
+; NODOT-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8:
+; NODOT: # %bb.0: # %entry
+; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; NODOT-NEXT: vsext.vf2 v10, v8
+; NODOT-NEXT: vzext.vf2 v8, v9
+; NODOT-NEXT: vwmulsu.vv v9, v10, v8
+; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; NODOT-NEXT: vslidedown.vi v8, v9, 3
+; NODOT-NEXT: vslidedown.vi v10, v9, 2
+; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; NODOT-NEXT: vadd.vv v8, v8, v9
+; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; NODOT-NEXT: vslidedown.vi v9, v9, 1
+; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; NODOT-NEXT: vadd.vv v9, v9, v10
+; NODOT-NEXT: vadd.vv v8, v9, v8
+; NODOT-NEXT: ret
+;
+; DOT-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8:
+; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; DOT-NEXT: vmv.s.x v10, zero
+; DOT-NEXT: vqdotsu.vv v10, v8, v9
+; DOT-NEXT: vmv1r.v v8, v10
+; DOT-NEXT: ret
entry:
%a.sext = sext <4 x i8> %a to <4 x i32>
%b.sext = zext <4 x i8> %b to <4 x i32>
@@ -672,23 +680,31 @@ entry:
}
define <1 x i32> @vqdotsu_vv_partial_reduce_swapped(<4 x i8> %a, <4 x i8> %b) {
-; CHECK-LABEL: vqdotsu_vv_partial_reduce_swapped:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vsext.vf2 v10, v8
-; CHECK-NEXT: vzext.vf2 v8, v9
-; CHECK-NEXT: vwmulsu.vv v9, v10, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v9, 3
-; CHECK-NEXT: vslidedown.vi v10, v9, 2
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v9, 1
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v9, v9, v10
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: ret
+; NODOT-LABEL: vqdotsu_vv_partial_reduce_swapped:
+; NODOT: # %bb.0: # %entry
+; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; NODOT-NEXT: vsext.vf2 v10, v8
+; NODOT-NEXT: vzext.vf2 v8, v9
+; NODOT-NEXT: vwmulsu.vv v9, v10, v8
+; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; NODOT-NEXT: vslidedown.vi v8, v9, 3
+; NODOT-NEXT: vslidedown.vi v10, v9, 2
+; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; NODOT-NEXT: vadd.vv v8, v8, v9
+; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; NODOT-NEXT: vslidedown.vi v9, v9, 1
+; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; NODOT-NEXT: vadd.vv v9, v9, v10
+; NODOT-NEXT: vadd.vv v8, v9, v8
+; NODOT-NEXT: ret
+;
+; DOT-LABEL: vqdotsu_vv_partial_reduce_swapped:
+; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; DOT-NEXT: vmv.s.x v10, zero
+; DOT-NEXT: vqdotsu.vv v10, v8, v9
+; DOT-NEXT: vmv1r.v v8, v10
+; DOT-NEXT: ret
entry:
%a.ext = sext <4 x i8> %a to <4 x i32>
%b.ext = zext <4 x i8> %b to <4 x i32>
@@ -1065,222 +1081,291 @@ entry:
; Test legalization - type split
define <64 x i32> @vqdotsu_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> %b) {
-; CHECK-LABEL: vqdotsu_vv_partial_v64i32_v256i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v0, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a3, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v24, v8, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsext.vf2 v8, v24
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v0, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v4, v12
-; CHECK-NEXT: vwmulsu.vv v24, v8, v4
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 5
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsext.vf2 v4, v8
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: add a3, a3, a4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vzext.vf2 v0, v8
-; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a1)
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vwmaccsu.vv v24, v4, v0
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v4, v16, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsext.vf2 v12, v4
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v4, v8, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v16, v4
-; CHECK-NEXT: vwmulsu.vv v0, v12, v16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsext.vf2 v12, v16
-; CHECK-NEXT: vzext.vf2 v20, v8
-; CHECK-NEXT: vwmaccsu.vv v0, v12, v20
-; CHECK-NEXT: li a1, 64
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v16, a1
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vslidedown.vx v8, v16, a1
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 5
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsext.vf2 v16, v8
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vzext.vf2 v20, v8
-; CHECK-NEXT: vwmaccsu.vv v24, v16, v20
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v16, a1
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsext.vf2 v8, v16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vzext.vf2 v20, v8
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; CHECK-NEXT: vwmaccsu.vv v0, v8, v20
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v20, v8, a0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsext.vf2 v12, v20
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
-; CHECK-NEXT: vwmaccsu.vv v24, v8, v12
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v16, a0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vslidedown.vx v8, v16, a0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsext.vf2 v16, v12
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vwmaccsu.vv v0, v16, v12
-; CHECK-NEXT: vmv8r.v v8, v24
-; CHECK-NEXT: vmv8r.v v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; NODOT-LABEL: vqdotsu_vv_partial_v64i32_v256i8:
+; NODOT: # %bb.0: # %entry
+; NODOT-NEXT: addi sp, sp, -16
+; NODOT-NEXT: .cfi_def_cfa_offset 16
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 3
+; NODOT-NEXT: mv a2, a1
+; NODOT-NEXT: slli a1, a1, 2
+; NODOT-NEXT: add a1, a1, a2
+; NODOT-NEXT: sub sp, sp, a1
+; NODOT-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 4
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 5
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: addi a1, a0, 128
+; NODOT-NEXT: li a2, 128
+; NODOT-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; NODOT-NEXT: vle8.v v0, (a0)
+; NODOT-NEXT: csrr a0, vlenb
+; NODOT-NEXT: slli a0, a0, 3
+; NODOT-NEXT: mv a3, a0
+; NODOT-NEXT: slli a0, a0, 1
+; NODOT-NEXT: add a0, a0, a3
+; NODOT-NEXT: add a0, sp, a0
+; NODOT-NEXT: addi a0, a0, 16
+; NODOT-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: li a0, 32
+; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; NODOT-NEXT: vslidedown.vx v24, v8, a0
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vsext.vf2 v8, v24
+; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; NODOT-NEXT: vslidedown.vx v12, v0, a0
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vzext.vf2 v4, v12
+; NODOT-NEXT: vwmulsu.vv v24, v8, v4
+; NODOT-NEXT: csrr a3, vlenb
+; NODOT-NEXT: slli a3, a3, 5
+; NODOT-NEXT: add a3, sp, a3
+; NODOT-NEXT: addi a3, a3, 16
+; NODOT-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vsext.vf2 v4, v8
+; NODOT-NEXT: csrr a3, vlenb
+; NODOT-NEXT: slli a3, a3, 3
+; NODOT-NEXT: mv a4, a3
+; NODOT-NEXT: slli a3, a3, 1
+; NODOT-NEXT: add a3, a3, a4
+; NODOT-NEXT: add a3, sp, a3
+; NODOT-NEXT: addi a3, a3, 16
+; NODOT-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vzext.vf2 v0, v8
+; NODOT-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; NODOT-NEXT: vle8.v v8, (a1)
+; NODOT-NEXT: addi a1, sp, 16
+; NODOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vwmaccsu.vv v24, v4, v0
+; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; NODOT-NEXT: vslidedown.vx v4, v16, a0
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vsext.vf2 v12, v4
+; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; NODOT-NEXT: vslidedown.vx v4, v8, a0
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vzext.vf2 v16, v4
+; NODOT-NEXT: vwmulsu.vv v0, v12, v16
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 4
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vsext.vf2 v12, v16
+; NODOT-NEXT: vzext.vf2 v20, v8
+; NODOT-NEXT: vwmaccsu.vv v0, v12, v20
+; NODOT-NEXT: li a1, 64
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 5
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; NODOT-NEXT: vslidedown.vx v8, v16, a1
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 5
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 3
+; NODOT-NEXT: mv a3, a2
+; NODOT-NEXT: slli a2, a2, 1
+; NODOT-NEXT: add a2, a2, a3
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vslidedown.vx v8, v16, a1
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 3
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 5
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vsext.vf2 v16, v8
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 3
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vzext.vf2 v20, v8
+; NODOT-NEXT: vwmaccsu.vv v24, v16, v20
+; NODOT-NEXT: csrr a2, vlenb
+; NODOT-NEXT: slli a2, a2, 4
+; NODOT-NEXT: add a2, sp, a2
+; NODOT-NEXT: addi a2, a2, 16
+; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; NODOT-NEXT: vslidedown.vx v16, v16, a1
+; NODOT-NEXT: addi a2, sp, 16
+; NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vslidedown.vx v8, v8, a1
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 3
+; NODOT-NEXT: mv a2, a1
+; NODOT-NEXT: slli a1, a1, 1
+; NODOT-NEXT: add a1, a1, a2
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vsext.vf2 v8, v16
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 4
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 3
+; NODOT-NEXT: mv a2, a1
+; NODOT-NEXT: slli a1, a1, 1
+; NODOT-NEXT: add a1, a1, a2
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vzext.vf2 v20, v8
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 4
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; NODOT-NEXT: vwmaccsu.vv v0, v8, v20
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 5
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; NODOT-NEXT: vslidedown.vx v20, v8, a0
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 3
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vslidedown.vx v8, v8, a0
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vsext.vf2 v12, v20
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 5
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
+; NODOT-NEXT: vzext.vf2 v12, v8
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 5
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload
+; NODOT-NEXT: vwmaccsu.vv v24, v8, v12
+; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; NODOT-NEXT: vslidedown.vx v12, v16, a0
+; NODOT-NEXT: csrr a1, vlenb
+; NODOT-NEXT: slli a1, a1, 3
+; NODOT-NEXT: mv a2, a1
+; NODOT-NEXT: slli a1, a1, 1
+; NODOT-NEXT: add a1, a1, a2
+; NODOT-NEXT: add a1, sp, a1
+; NODOT-NEXT: addi a1, a1, 16
+; NODOT-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; NODOT-NEXT: vslidedown.vx v8, v16, a0
+; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; NODOT-NEXT: vsext.vf2 v16, v12
+; NODOT-NEXT: vzext.vf2 v12, v8
+; NODOT-NEXT: vwmaccsu.vv v0, v16, v12
+; NODOT-NEXT: vmv8r.v v8, v24
+; NODOT-NEXT: vmv8r.v v16, v0
+; NODOT-NEXT: csrr a0, vlenb
+; NODOT-NEXT: slli a0, a0, 3
+; NODOT-NEXT: mv a1, a0
+; NODOT-NEXT: slli a0, a0, 2
+; NODOT-NEXT: add a0, a0, a1
+; NODOT-NEXT: add sp, sp, a0
+; NODOT-NEXT: .cfi_def_cfa sp, 16
+; NODOT-NEXT: addi sp, sp, 16
+; NODOT-NEXT: .cfi_def_cfa_offset 0
+; NODOT-NEXT: ret
+;
+; DOT-LABEL: vqdotsu_vv_partial_v64i32_v256i8:
+; DOT: # %bb.0: # %entry
+; DOT-NEXT: addi sp, sp, -16
+; DOT-NEXT: .cfi_def_cfa_offset 16
+; DOT-NEXT: csrr a1, vlenb
+; DOT-NEXT: slli a1, a1, 5
+; DOT-NEXT: sub sp, sp, a1
+; DOT-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; DOT-NEXT: csrr a1, vlenb
+; DOT-NEXT: slli a1, a1, 3
+; DOT-NEXT: mv a2, a1
+; DOT-NEXT: slli a1, a1, 1
+; DOT-NEXT: add a1, a1, a2
+; DOT-NEXT: add a1, sp, a1
+; DOT-NEXT: addi a1, a1, 16
+; DOT-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; DOT-NEXT: csrr a1, vlenb
+; DOT-NEXT: slli a1, a1, 4
+; DOT-NEXT: add a1, sp, a1
+; DOT-NEXT: addi a1, a1, 16
+; DOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; DOT-NEXT: addi a1, a0, 128
+; DOT-NEXT: li a2, 128
+; DOT-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; DOT-NEXT: vle8.v v8, (a0)
+; DOT-NEXT: csrr a0, vlenb
+; DOT-NEXT: slli a0, a0, 3
+; DOT-NEXT: add a0, sp, a0
+; DOT-NEXT: addi a0, a0, 16
+; DOT-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; DOT-NEXT: li a0, 32
+; DOT-NEXT: vle8.v v8, (a1)
+; DOT-NEXT: addi a1, sp, 16
+; DOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; DOT-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; DOT-NEXT: vmv.v.i v24, 0
+; DOT-NEXT: vmv.v.i v0, 0
+; DOT-NEXT: csrr a0, vlenb
+; DOT-NEXT: slli a0, a0, 4
+; DOT-NEXT: add a0, sp, a0
+; DOT-NEXT: addi a0, a0, 16
+; DOT-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; DOT-NEXT: csrr a0, vlenb
+; DOT-NEXT: slli a0, a0, 3
+; DOT-NEXT: add a0, sp, a0
+; DOT-NEXT: addi a0, a0, 16
+; DOT-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; DOT-NEXT: vqdotsu.vv v0, v16, v8
+; DOT-NEXT: csrr a0, vlenb
+; DOT-NEXT: slli a0, a0, 3
+; DOT-NEXT: mv a1, a0
+; DOT-NEXT: slli a0, a0, 1
+; DOT-NEXT: add a0, a0, a1
+; DOT-NEXT: add a0, sp, a0
+; DOT-NEXT: addi a0, a0, 16
+; DOT-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; DOT-NEXT: addi a0, sp, 16
+; DOT-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; DOT-NEXT: vqdotsu.vv v24, v16, v8
+; DOT-NEXT: vmv.v.v v8, v0
+; DOT-NEXT: vmv.v.v v16, v24
+; DOT-NEXT: csrr a0, vlenb
+; DOT-NEXT: slli a0, a0, 5
+; DOT-NEXT: add sp, sp, a0
+; DOT-NEXT: .cfi_def_cfa sp, 16
+; DOT-NEXT: addi sp, sp, 16
+; DOT-NEXT: .cfi_def_cfa_offset 0
+; DOT-NEXT: ret
entry:
%a.ext = sext <256 x i8> %a to <256 x i32>
%b.ext = zext <256 x i8> %b to <256 x i32>
@@ -1289,6 +1374,56 @@ entry:
ret <64 x i32> %res
}
+; Test legalization - integer promote
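+; The i7 element types are not legal, so the inputs are promoted first:
+; the signed operand is sign-extended with a shift-left / arithmetic
+; shift-right pair (by 25 in e32 for NODOT, by 1 in e8 for DOT), and the
+; unsigned operand is zero-extended by masking with 127.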
+define <4 x i31> @vqdotsu_vv_partial_v4i31_v16i7(<16 x i7> %a, <16 x i7> %b) {
+; NODOT-LABEL: vqdotsu_vv_partial_v4i31_v16i7:
+; NODOT: # %bb.0: # %entry
+; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; NODOT-NEXT: vzext.vf4 v12, v8
+; NODOT-NEXT: li a0, 127
+; NODOT-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; NODOT-NEXT: vand.vx v16, v9, a0
+; NODOT-NEXT: lui a0, 524288
+; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT: vsll.vi v8, v12, 25
+; NODOT-NEXT: addi a0, a0, -1
+; NODOT-NEXT: vsra.vi v8, v8, 25
+; NODOT-NEXT: vzext.vf4 v12, v16
+; NODOT-NEXT: vmul.vv v8, v12, v8
+; NODOT-NEXT: vand.vx v8, v8, a0
+; NODOT-NEXT: vsetivli zero, 4, e32, m4, ta, ma
+; NODOT-NEXT: vslidedown.vi v12, v8, 12
+; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; NODOT-NEXT: vadd.vv v16, v12, v8
+; NODOT-NEXT: vsetivli zero, 4, e32, m4, ta, ma
+; NODOT-NEXT: vslidedown.vi v12, v8, 8
+; NODOT-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; NODOT-NEXT: vslidedown.vi v8, v8, 4
+; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; NODOT-NEXT: vadd.vv v8, v8, v12
+; NODOT-NEXT: vadd.vv v8, v8, v16
+; NODOT-NEXT: ret
+;
+; DOT-LABEL: vqdotsu_vv_partial_v4i31_v16i7:
+; DOT: # %bb.0: # %entry
+; DOT-NEXT: li a0, 127
+; DOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; DOT-NEXT: vadd.vv v8, v8, v8
+; DOT-NEXT: vand.vx v9, v9, a0
+; DOT-NEXT: vsra.vi v10, v8, 1
+; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT: vmv.v.i v8, 0
+; DOT-NEXT: vqdotsu.vv v8, v10, v9
+; DOT-NEXT: ret
+entry:
+ %a.ext = sext <16 x i7> %a to <16 x i31>
+ %b.ext = zext <16 x i7> %b to <16 x i31>
+ %mul = mul <16 x i31> %b.ext, %a.ext
+ %res = call <4 x i31> @llvm.experimental.vector.partial.reduce.add(<4 x i31> zeroinitializer, <16 x i31> %mul)
+ ret <4 x i31> %res
+}
+
; Test legalization - expand
define <1 x i32> @vqdotsu_vv_partial_v1i32_v2i8(<2 x i8> %a, <2 x i8> %b) {
; CHECK-LABEL: vqdotsu_vv_partial_v1i32_v2i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll
index 5272f1b..87a984b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll
@@ -232,13 +232,13 @@ define i32 @reduce_of_sext(<vscale x 16 x i8> %a) {
;
; DOT-LABEL: reduce_of_sext:
; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; DOT-NEXT: vmv.v.i v10, 1
; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; DOT-NEXT: vmv.v.i v10, 0
-; DOT-NEXT: lui a0, 4112
-; DOT-NEXT: addi a0, a0, 257
-; DOT-NEXT: vqdot.vx v10, v8, a0
+; DOT-NEXT: vmv.v.i v12, 0
+; DOT-NEXT: vqdot.vv v12, v8, v10
; DOT-NEXT: vmv.s.x v8, zero
-; DOT-NEXT: vredsum.vs v8, v10, v8
+; DOT-NEXT: vredsum.vs v8, v12, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
@@ -259,13 +259,13 @@ define i32 @reduce_of_zext(<vscale x 16 x i8> %a) {
;
; DOT-LABEL: reduce_of_zext:
; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; DOT-NEXT: vmv.v.i v10, 1
; DOT-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; DOT-NEXT: vmv.v.i v10, 0
-; DOT-NEXT: lui a0, 4112
-; DOT-NEXT: addi a0, a0, 257
-; DOT-NEXT: vqdotu.vx v10, v8, a0
+; DOT-NEXT: vmv.v.i v12, 0
+; DOT-NEXT: vqdotu.vv v12, v8, v10
; DOT-NEXT: vmv.s.x v8, zero
-; DOT-NEXT: vredsum.vs v8, v10, v8
+; DOT-NEXT: vredsum.vs v8, v12, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
@@ -910,22 +910,30 @@ entry:
}
define <vscale x 1 x i32> @partial_reduce_vqdotsu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
-; CHECK-LABEL: partial_reduce_vqdotsu:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vsext.vf2 v10, v8
-; CHECK-NEXT: vzext.vf2 v11, v9
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: vwmulsu.vv v8, v10, v11
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v10, v9, a0
-; CHECK-NEXT: vslidedown.vx v11, v8, a0
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v8, v10, v8
-; CHECK-NEXT: vadd.vv v9, v11, v9
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: ret
+; NODOT-LABEL: partial_reduce_vqdotsu:
+; NODOT: # %bb.0: # %entry
+; NODOT-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; NODOT-NEXT: vsext.vf2 v10, v8
+; NODOT-NEXT: vzext.vf2 v11, v9
+; NODOT-NEXT: csrr a0, vlenb
+; NODOT-NEXT: vwmulsu.vv v8, v10, v11
+; NODOT-NEXT: srli a0, a0, 3
+; NODOT-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; NODOT-NEXT: vslidedown.vx v10, v9, a0
+; NODOT-NEXT: vslidedown.vx v11, v8, a0
+; NODOT-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; NODOT-NEXT: vadd.vv v8, v10, v8
+; NODOT-NEXT: vadd.vv v9, v11, v9
+; NODOT-NEXT: vadd.vv v8, v9, v8
+; NODOT-NEXT: ret
+;
+; DOT-LABEL: partial_reduce_vqdotsu:
+; DOT: # %bb.0: # %entry
+; DOT-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; DOT-NEXT: vmv.v.i v10, 0
+; DOT-NEXT: vqdotsu.vv v10, v8, v9
+; DOT-NEXT: vmv1r.v v8, v10
+; DOT-NEXT: ret
entry:
%a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
%b.sext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index 1517e52..efc8243 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -8,16 +8,13 @@
; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
-; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't
-; present?
define i8 @out8(i8 %x, i8 %y, i8 %mask) {
; CHECK-I-LABEL: out8:
; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: and a0, a0, a2
-; CHECK-I-NEXT: not a2, a2
-; CHECK-I-NEXT: and a1, a1, a2
-; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out8:
@@ -36,10 +33,9 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
define i16 @out16(i16 %x, i16 %y, i16 %mask) {
; CHECK-I-LABEL: out16:
; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: and a0, a0, a2
-; CHECK-I-NEXT: not a2, a2
-; CHECK-I-NEXT: and a1, a1, a2
-; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out16:
@@ -58,10 +54,9 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
define i32 @out32(i32 %x, i32 %y, i32 %mask) {
; CHECK-I-LABEL: out32:
; CHECK-I: # %bb.0:
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: and a0, a0, a2
-; CHECK-I-NEXT: not a2, a2
-; CHECK-I-NEXT: and a1, a1, a2
-; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out32:
@@ -80,22 +75,19 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
define i64 @out64(i64 %x, i64 %y, i64 %mask) {
; RV32I-LABEL: out64:
; RV32I: # %bb.0:
-; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: and a0, a0, a4
-; RV32I-NEXT: not a4, a4
-; RV32I-NEXT: not a5, a5
-; RV32I-NEXT: and a3, a3, a5
-; RV32I-NEXT: and a2, a2, a4
-; RV32I-NEXT: or a0, a0, a2
-; RV32I-NEXT: or a1, a1, a3
+; RV32I-NEXT: and a1, a1, a5
+; RV32I-NEXT: xor a0, a0, a2
+; RV32I-NEXT: xor a1, a1, a3
; RV32I-NEXT: ret
;
; RV64I-LABEL: out64:
; RV64I: # %bb.0:
+; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: and a0, a0, a2
-; RV64I-NEXT: not a2, a2
-; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ret
;
; RV32ZBB-LABEL: out64:
@@ -660,10 +652,9 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
; CHECK-I-LABEL: out_constant_varx_42:
; CHECK-I: # %bb.0:
-; CHECK-I-NEXT: not a1, a2
-; CHECK-I-NEXT: and a0, a2, a0
-; CHECK-I-NEXT: andi a1, a1, 42
-; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: xori a0, a0, 42
+; CHECK-I-NEXT: and a0, a0, a2
+; CHECK-I-NEXT: xori a0, a0, 42
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out_constant_varx_42:
@@ -704,10 +695,9 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
; CHECK-I-LABEL: out_constant_varx_42_invmask:
; CHECK-I: # %bb.0:
-; CHECK-I-NEXT: not a1, a2
-; CHECK-I-NEXT: and a0, a1, a0
-; CHECK-I-NEXT: andi a1, a2, 42
-; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: xori a1, a0, 42
+; CHECK-I-NEXT: and a1, a1, a2
+; CHECK-I-NEXT: xor a0, a1, a0
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out_constant_varx_42_invmask:
@@ -812,10 +802,9 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
; CHECK-I-LABEL: out_constant_42_vary:
; CHECK-I: # %bb.0:
-; CHECK-I-NEXT: not a0, a2
-; CHECK-I-NEXT: andi a2, a2, 42
-; CHECK-I-NEXT: and a0, a0, a1
-; CHECK-I-NEXT: or a0, a2, a0
+; CHECK-I-NEXT: xori a0, a1, 42
+; CHECK-I-NEXT: and a0, a0, a2
+; CHECK-I-NEXT: xor a0, a0, a1
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out_constant_42_vary:
@@ -855,10 +844,9 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
; CHECK-I-LABEL: out_constant_42_vary_invmask:
; CHECK-I: # %bb.0:
-; CHECK-I-NEXT: not a0, a2
-; CHECK-I-NEXT: andi a0, a0, 42
-; CHECK-I-NEXT: and a1, a2, a1
-; CHECK-I-NEXT: or a0, a0, a1
+; CHECK-I-NEXT: xori a0, a1, 42
+; CHECK-I-NEXT: and a0, a0, a2
+; CHECK-I-NEXT: xori a0, a0, 42
; CHECK-I-NEXT: ret
;
; CHECK-ZBB-LABEL: out_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
index 3f5b949..920dd02 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
@@ -231,3 +231,103 @@ define i64 @sexti32_i64_2(i32 %a) {
%1 = sext i32 %a to i64
ret i64 %1
}
+
+define i32 @extu_from_and_i32(i32 %x) {
+; RV32I-LABEL: extu_from_and_i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 20
+; RV32I-NEXT: srli a0, a0, 20
+; RV32I-NEXT: ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_i32:
+; RV32XQCIBM: # %bb.0:
+; RV32XQCIBM-NEXT: qc.extu a0, a0, 12, 0
+; RV32XQCIBM-NEXT: ret
+ %a = and i32 %x, 4095
+ ret i32 %a
+}
+
+define i64 @extu_from_and_i64(i64 %x) {
+; RV32I-LABEL: extu_from_and_i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 20
+; RV32I-NEXT: srli a0, a0, 20
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_i64:
+; RV32XQCIBM: # %bb.0:
+; RV32XQCIBM-NEXT: qc.extu a0, a0, 12, 0
+; RV32XQCIBM-NEXT: li a1, 0
+; RV32XQCIBM-NEXT: ret
+ %a = and i64 %x, 4095
+ ret i64 %a
+}
+
+define i32 @extu_from_and_lshr_i32(i32 %x) {
+; RV32I-LABEL: extu_from_and_lshr_i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 6
+; RV32I-NEXT: srli a0, a0, 29
+; RV32I-NEXT: ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_lshr_i32:
+; RV32XQCIBM: # %bb.0:
+; RV32XQCIBM-NEXT: qc.extu a0, a0, 3, 23
+; RV32XQCIBM-NEXT: ret
+ %shifted = lshr i32 %x, 23
+ %masked = and i32 %shifted, 7
+ ret i32 %masked
+}
+
+define i64 @extu_from_and_lshr_i64(i64 %x) {
+; RV32I-LABEL: extu_from_and_lshr_i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a1, 6
+; RV32I-NEXT: srli a0, a0, 20
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_lshr_i64:
+; RV32XQCIBM: # %bb.0:
+; RV32XQCIBM-NEXT: qc.extu a0, a1, 12, 14
+; RV32XQCIBM-NEXT: li a1, 0
+; RV32XQCIBM-NEXT: ret
+ %shifted = lshr i64 %x, 46
+ %masked = and i64 %shifted, 4095
+ ret i64 %masked
+}
+
+define i32 @extu_from_lshr_and_i32(i32 %x) {
+; RV32I-LABEL: extu_from_lshr_and_i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: srli a0, a0, 20
+; RV32I-NEXT: ret
+;
+; RV32XQCIBM-LABEL: extu_from_lshr_and_i32:
+; RV32XQCIBM: # %bb.0:
+; RV32XQCIBM-NEXT: qc.extu a0, a0, 12, 12
+; RV32XQCIBM-NEXT: ret
+ %masked = and i32 %x, 16773120
+ %shifted = lshr i32 %masked, 12
+ ret i32 %shifted
+}
+
+define i64 @extu_from_lshr_and_i64(i64 %x) {
+; RV32I-LABEL: extu_from_lshr_and_i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: srli a0, a0, 20
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: ret
+;
+; RV32XQCIBM-LABEL: extu_from_lshr_and_i64:
+; RV32XQCIBM: # %bb.0:
+; RV32XQCIBM-NEXT: qc.extu a0, a0, 12, 12
+; RV32XQCIBM-NEXT: li a1, 0
+; RV32XQCIBM-NEXT: ret
+ %masked = and i64 %x, 16773120
+ %shifted = lshr i64 %masked, 12
+ ret i64 %shifted
+}
diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
new file mode 100644
index 0000000..c0143455
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3
+
+; Test that masked-merge code is generated as an "xor; and; xor" sequence, or
+; as an "andn; and; or" sequence when an and-with-complement instruction is available.
+
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: masked_merge0:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: xr %r3, %r4
+; NO-MISC3-NEXT: nr %r2, %r3
+; NO-MISC3-NEXT: xr %r2, %r4
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge0:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nr %r3, %r2
+; MISC3-NEXT: ncrk %r2, %r4, %r2
+; MISC3-NEXT: or %r2, %r3
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; NO-MISC3-LABEL: masked_merge1:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: xr %r3, %r4
+; NO-MISC3-NEXT: nr %r2, %r3
+; NO-MISC3-NEXT: xr %r2, %r4
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge1:
+; MISC3: # %bb.0:
+; MISC3-NEXT: ncrk %r0, %r4, %r2
+; MISC3-NEXT: nr %r2, %r3
+; MISC3-NEXT: or %r2, %r0
+; MISC3-NEXT: br %r14
+ %and0 = and i16 %a0, %a1
+ %not = xor i16 %a0, -1
+ %and1 = and i16 %a2, %not
+ %or = or i16 %and0, %and1
+ ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; NO-MISC3-LABEL: masked_merge2:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: lr %r2, %r3
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge2:
+; MISC3: # %bb.0:
+; MISC3-NEXT: lr %r2, %r3
+; MISC3-NEXT: br %r14
+ %not = xor i8 %a0, -1
+ %and0 = and i8 %not, %a1
+ %and1 = and i8 %a1, %a0
+ %or = or i8 %and0, %and1
+ ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; NO-MISC3-LABEL: masked_merge3:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: lcgr %r0, %r4
+; NO-MISC3-NEXT: aghi %r0, -1
+; NO-MISC3-NEXT: xgr %r3, %r0
+; NO-MISC3-NEXT: ngr %r3, %r2
+; NO-MISC3-NEXT: xgr %r3, %r2
+; NO-MISC3-NEXT: xgrk %r2, %r3, %r0
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge3:
+; MISC3: # %bb.0:
+; MISC3-NEXT: lcgr %r0, %r2
+; MISC3-NEXT: aghi %r0, -1
+; MISC3-NEXT: ncgrk %r0, %r0, %r4
+; MISC3-NEXT: ncgrk %r2, %r2, %r3
+; MISC3-NEXT: ogr %r2, %r0
+; MISC3-NEXT: br %r14
+ %v0 = xor i64 %a1, -1
+ %v1 = xor i64 %a2, -1
+ %not = xor i64 %a0, -1
+ %and0 = and i64 %not, %v1
+ %and1 = and i64 %v0, %a0
+ %or = or i64 %and0, %and1
+ ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge0:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: lcr %r0, %r2
+; NO-MISC3-NEXT: nr %r3, %r2
+; NO-MISC3-NEXT: nr %r0, %r4
+; NO-MISC3-NEXT: ork %r2, %r3, %r0
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: not_a_masked_merge0:
+; MISC3: # %bb.0:
+; MISC3-NEXT: lcr %r0, %r2
+; MISC3-NEXT: nr %r3, %r2
+; MISC3-NEXT: nr %r0, %r4
+; MISC3-NEXT: ork %r2, %r3, %r0
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not_a_not = sub i32 0, %a0
+ %and1 = and i32 %not_a_not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; NO-MISC3-LABEL: not_a_masked_merge1:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: xilf %r5, 4294967295
+; NO-MISC3-NEXT: nr %r2, %r3
+; NO-MISC3-NEXT: nr %r4, %r5
+; NO-MISC3-NEXT: or %r2, %r4
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: not_a_masked_merge1:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nr %r2, %r3
+; MISC3-NEXT: ncrk %r0, %r4, %r5
+; MISC3-NEXT: or %r2, %r0
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a3, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge2:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: or %r3, %r2
+; NO-MISC3-NEXT: xilf %r2, 4294967295
+; NO-MISC3-NEXT: nr %r2, %r4
+; NO-MISC3-NEXT: or %r2, %r3
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: not_a_masked_merge2:
+; MISC3: # %bb.0:
+; MISC3-NEXT: or %r3, %r2
+; MISC3-NEXT: ncrk %r2, %r4, %r2
+; MISC3-NEXT: or %r2, %r3
+; MISC3-NEXT: br %r14
+ %not_an_and0 = or i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %not_an_and0, %and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge3:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: nr %r3, %r2
+; NO-MISC3-NEXT: xr %r2, %r4
+; NO-MISC3-NEXT: xilf %r2, 4294967295
+; NO-MISC3-NEXT: or %r2, %r3
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: not_a_masked_merge3:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nr %r3, %r2
+; MISC3-NEXT: xr %r2, %r4
+; MISC3-NEXT: ocrk %r2, %r3, %r2
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %not_an_and1 = xor i32 %not, %a2
+ %or = or i32 %and0, %not_an_and1
+ ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge4:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: nr %r2, %r3
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: not_a_masked_merge4:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nr %r2, %r3
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a2, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform0:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: nr %r3, %r2
+; NO-MISC3-NEXT: xilf %r2, 4294967295
+; NO-MISC3-NEXT: nr %r2, %r4
+; NO-MISC3-NEXT: or %r2, %r3
+; NO-MISC3-NEXT: st %r3, 0(%r5)
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform0:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nr %r3, %r2
+; MISC3-NEXT: ncrk %r2, %r4, %r2
+; MISC3-NEXT: or %r2, %r3
+; MISC3-NEXT: st %r3, 0(%r5)
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ store i32 %and0, ptr %p1
+ ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform1:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: nrk %r0, %r2, %r3
+; NO-MISC3-NEXT: xilf %r2, 4294967295
+; NO-MISC3-NEXT: nr %r4, %r2
+; NO-MISC3-NEXT: or %r0, %r4
+; NO-MISC3-NEXT: st %r2, 0(%r5)
+; NO-MISC3-NEXT: lr %r2, %r0
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform1:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nrk %r0, %r2, %r3
+; MISC3-NEXT: ncrk %r1, %r4, %r2
+; MISC3-NEXT: xilf %r2, 4294967295
+; MISC3-NEXT: or %r0, %r1
+; MISC3-NEXT: st %r2, 0(%r5)
+; MISC3-NEXT: lr %r2, %r0
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ store i32 %not, ptr %p1
+ ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform2:
+; NO-MISC3: # %bb.0:
+; NO-MISC3-NEXT: nr %r3, %r2
+; NO-MISC3-NEXT: xilf %r2, 4294967295
+; NO-MISC3-NEXT: nr %r4, %r2
+; NO-MISC3-NEXT: ork %r2, %r3, %r4
+; NO-MISC3-NEXT: st %r4, 0(%r5)
+; NO-MISC3-NEXT: br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform2:
+; MISC3: # %bb.0:
+; MISC3-NEXT: nr %r3, %r2
+; MISC3-NEXT: ncrk %r0, %r4, %r2
+; MISC3-NEXT: ork %r2, %r3, %r0
+; MISC3-NEXT: st %r0, 0(%r5)
+; MISC3-NEXT: br %r14
+ %and0 = and i32 %a0, %a1
+ %not = xor i32 %a0, -1
+ %and1 = and i32 %not, %a2
+ %or = or i32 %and0, %and1
+ store i32 %and1, ptr %p1
+ ret i32 %or
+}
diff --git a/llvm/test/CodeGen/Thumb2/float-cmp.ll b/llvm/test/CodeGen/Thumb2/float-cmp.ll
index 73e0063..ed80544 100644
--- a/llvm/test/CodeGen/Thumb2/float-cmp.ll
+++ b/llvm/test/CodeGen/Thumb2/float-cmp.ll
@@ -200,8 +200,13 @@ define i1 @cmp_d_one(double %a, double %b) {
; CHECK-LABEL: cmp_d_one:
; NONE: bl __aeabi_dcmpeq
; NONE: bl __aeabi_dcmpun
-; SP: bl __aeabi_dcmpeq
; SP: bl __aeabi_dcmpun
+; SP: eor r8, r0, #1
+; SP: bl __aeabi_dcmpeq
+; SP-NEXT: clz r0, r0
+; SP-NEXT: lsrs r0, r0, #5
+; SP-NEXT: ands.w r0, r0, r8
+
; DP: vcmp.f64
; DP: movmi r0, #1
; DP: movgt r0, #1
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 185c46a..e3607e1 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -4465,203 +4465,139 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: bitselect_v16i8:
; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push1=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1
-; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48
-; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
-; NO-SIMD128-NEXT: i32.and $push5=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push79=, -1
-; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79
-; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47
-; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
-; NO-SIMD128-NEXT: i32.and $push9=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push78=, -1
-; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46
-; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
-; NO-SIMD128-NEXT: i32.and $push13=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push77=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77
-; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45
-; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
-; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push76=, -1
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76
-; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44
-; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
-; NO-SIMD128-NEXT: i32.and $push21=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push75=, -1
-; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75
-; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43
-; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
-; NO-SIMD128-NEXT: i32.and $push25=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push74=, -1
-; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42
-; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
-; NO-SIMD128-NEXT: i32.and $push29=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push73=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73
-; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41
-; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
-; NO-SIMD128-NEXT: i32.and $push33=, $8, $24
-; NO-SIMD128-NEXT: i32.const $push72=, -1
-; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72
-; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40
-; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
-; NO-SIMD128-NEXT: i32.and $push37=, $7, $23
-; NO-SIMD128-NEXT: i32.const $push71=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39
-; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
-; NO-SIMD128-NEXT: i32.and $push41=, $6, $22
-; NO-SIMD128-NEXT: i32.const $push70=, -1
-; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70
-; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38
-; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
-; NO-SIMD128-NEXT: i32.and $push45=, $5, $21
-; NO-SIMD128-NEXT: i32.const $push69=, -1
-; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69
-; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37
-; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
-; NO-SIMD128-NEXT: i32.and $push49=, $4, $20
-; NO-SIMD128-NEXT: i32.const $push68=, -1
-; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68
-; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36
-; NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51
-; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
-; NO-SIMD128-NEXT: i32.and $push53=, $3, $19
-; NO-SIMD128-NEXT: i32.const $push67=, -1
-; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67
-; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35
-; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
-; NO-SIMD128-NEXT: i32.and $push57=, $2, $18
-; NO-SIMD128-NEXT: i32.const $push66=, -1
-; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66
-; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34
-; NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
-; NO-SIMD128-NEXT: i32.and $push61=, $1, $17
-; NO-SIMD128-NEXT: i32.const $push65=, -1
-; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65
-; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33
-; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
+; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48
+; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16
+; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $14
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop23
+; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40
+; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8
+; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop26
+; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7
+; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37
+; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5
+; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop35
+; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36
+; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4
+; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop38
+; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3
+; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop41
+; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33
+; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1
+; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop47
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
; NO-SIMD128-FAST: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
-; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $17
-; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $pop2, $33
-; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34
-; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36
-; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37
-; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38
-; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74
-; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39
-; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40
-; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41
-; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71
-; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42
-; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43
-; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44
-; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
-; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45
-; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46
-; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66
-; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47
-; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
-; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48
-; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63
-; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
+; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $17, $33
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $33
+; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $34
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $34
+; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $19, $35
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $12
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <16 x i8> %c, %v1
%inv_mask = xor <16 x i8> %c,
@@ -7546,107 +7482,75 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: bitselect_v8i16:
; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $16, $8
-; NO-SIMD128-NEXT: i32.const $push1=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1
-; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2
-; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
-; NO-SIMD128-NEXT: i32.and $push5=, $15, $7
-; NO-SIMD128-NEXT: i32.const $push39=, -1
-; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39
-; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6
-; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
-; NO-SIMD128-NEXT: i32.and $push9=, $14, $6
-; NO-SIMD128-NEXT: i32.const $push38=, -1
-; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38
-; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10
-; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
-; NO-SIMD128-NEXT: i32.and $push13=, $13, $5
-; NO-SIMD128-NEXT: i32.const $push37=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37
-; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14
-; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-NEXT: i32.and $push17=, $12, $4
-; NO-SIMD128-NEXT: i32.const $push36=, -1
-; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36
-; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18
-; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
-; NO-SIMD128-NEXT: i32.and $push21=, $11, $3
-; NO-SIMD128-NEXT: i32.const $push35=, -1
-; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35
-; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22
-; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
-; NO-SIMD128-NEXT: i32.and $push25=, $10, $2
-; NO-SIMD128-NEXT: i32.const $push34=, -1
-; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34
-; NO-SIMD128-NEXT: i32.and $push27=, $18, $pop26
-; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
-; NO-SIMD128-NEXT: i32.and $push29=, $9, $1
-; NO-SIMD128-NEXT: i32.const $push33=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33
-; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30
-; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24
+; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8
+; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
; NO-SIMD128-FAST: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
-; NO-SIMD128-FAST-NEXT: i32.and $push0=, $9, $1
-; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $17, $pop2
-; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6
-; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18
-; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22
-; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34
-; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26
-; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30
-; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $9, $17
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $17
+; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $18
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $18
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $11, $19
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <8 x i16> %v1, %c
%inv_mask = xor <8 x i16>
@@ -9453,59 +9357,43 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-LABEL: bitselect_v4i32:
; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push1=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1
-; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12
-; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
-; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT: i32.store 12($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19
-; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $7
-; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push18=, -1
-; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10
-; NO-SIMD128-NEXT: i32.and $push9=, $2, $6
-; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $1, $pop17
-; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $9
-; NO-SIMD128-NEXT: i32.and $push13=, $1, $5
-; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-NEXT: i32.store 0($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12
+; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4
+; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
; NO-SIMD128-FAST: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
-; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $pop2, $9
-; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5
-; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6
-; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $7
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $5, $9
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $9
+; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $10
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $10
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $11
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $11
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $12
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <4 x i32> %c, %v1
%inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -10974,35 +10862,27 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
; NO-SIMD128-LABEL: bitselect_v2i64:
; NO-SIMD128: .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i64.const $push1=, -1
-; NO-SIMD128-NEXT: i64.xor $push2=, $2, $pop1
-; NO-SIMD128-NEXT: i64.and $push3=, $6, $pop2
-; NO-SIMD128-NEXT: i64.and $push0=, $4, $2
-; NO-SIMD128-NEXT: i64.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT: i64.store 8($0), $pop4
-; NO-SIMD128-NEXT: i64.const $push9=, -1
-; NO-SIMD128-NEXT: i64.xor $push6=, $1, $pop9
-; NO-SIMD128-NEXT: i64.and $push7=, $5, $pop6
-; NO-SIMD128-NEXT: i64.and $push5=, $3, $1
-; NO-SIMD128-NEXT: i64.or $push8=, $pop7, $pop5
-; NO-SIMD128-NEXT: i64.store 0($0), $pop8
+; NO-SIMD128-NEXT: i64.xor $push0=, $4, $6
+; NO-SIMD128-NEXT: i64.and $push1=, $pop0, $2
+; NO-SIMD128-NEXT: i64.xor $push2=, $pop1, $6
+; NO-SIMD128-NEXT: i64.store 8($0), $pop2
+; NO-SIMD128-NEXT: i64.xor $push3=, $3, $5
+; NO-SIMD128-NEXT: i64.and $push4=, $pop3, $1
+; NO-SIMD128-NEXT: i64.xor $push5=, $pop4, $5
+; NO-SIMD128-NEXT: i64.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v2i64:
; NO-SIMD128-FAST: .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
-; NO-SIMD128-FAST-NEXT: i64.const $push1=, -1
-; NO-SIMD128-FAST-NEXT: i64.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT: i64.and $push3=, $5, $pop2
-; NO-SIMD128-FAST-NEXT: i64.and $push0=, $3, $1
-; NO-SIMD128-FAST-NEXT: i64.or $push4=, $pop3, $pop0
-; NO-SIMD128-FAST-NEXT: i64.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i64.const $push9=, -1
-; NO-SIMD128-FAST-NEXT: i64.xor $push6=, $2, $pop9
-; NO-SIMD128-FAST-NEXT: i64.and $push7=, $6, $pop6
-; NO-SIMD128-FAST-NEXT: i64.and $push5=, $4, $2
-; NO-SIMD128-FAST-NEXT: i64.or $push8=, $pop7, $pop5
-; NO-SIMD128-FAST-NEXT: i64.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i64.xor $push0=, $3, $5
+; NO-SIMD128-FAST-NEXT: i64.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT: i64.xor $push2=, $pop1, $5
+; NO-SIMD128-FAST-NEXT: i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT: i64.xor $push3=, $4, $6
+; NO-SIMD128-FAST-NEXT: i64.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT: i64.xor $push5=, $pop4, $6
+; NO-SIMD128-FAST-NEXT: i64.store 8($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <2 x i64> %v1, %c
%inv_mask = xor <2 x i64> <i64 -1, i64 -1>, %c
diff --git a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll
index 9e9fc57..fc2bf88 100644
--- a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -24,7 +24,7 @@ define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float>
ret < 4 x float> %res
}
-; FIXME: we should just return %xmm0 here.
+; We just return %xmm0 here.
define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const0_mask:
; CHECK: ## %bb.0:
@@ -33,7 +33,7 @@ define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
ret < 4 x float> %res
}
-; FIXME: we should zero the lower element of xmm0 and return it.
+; We zero the lower element of xmm0 and return it.
define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const0_maskz:
; CHECK: ## %bb.0:
@@ -44,7 +44,7 @@ define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa
ret < 4 x float> %res
}
-; FIXME: we should just return %xmm0 here.
+; We just return %xmm0 here.
define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const2_mask:
; CHECK: ## %bb.0:
@@ -53,7 +53,7 @@ define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
ret < 4 x float> %res
}
-; FIXME: we should zero the lower element of xmm0 and return it.
+; We zero the lower element of xmm0 and return it.
define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const2_maskz:
; CHECK: ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
index 1d535f9..eac803f 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HasVL
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,NOVL
declare half @llvm.maxnum.f16(half, half)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
@@ -9,61 +10,397 @@ declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
define half @test_intrinsic_fmaxh(half %x, half %y) {
-; CHECK-LABEL: test_intrinsic_fmaxh:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
-; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmaxh:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; HasVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; HasVL-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmaxh:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; NOVL-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call half @llvm.maxnum.f16(half %x, half %y) readnone
ret half %z
}
define <2 x half> @test_intrinsic_fmax_v2f16(<2 x half> %x, <2 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v2f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v2f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT: # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT: # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> %y) readnone
ret <2 x half> %z
}
define <4 x half> @test_intrinsic_fmax_v4f16(<4 x half> %x, <4 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v4f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v4f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT: # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT: # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %x, <4 x half> %y) readnone
ret <4 x half> %z
}
define <8 x half> @test_intrinsic_fmax_v8f16(<8 x half> %x, <8 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v8f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v8f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT: # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT: # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %x, <8 x half> %y) readnone
ret <8 x half> %z
}
define <16 x half> @test_intrinsic_fmax_v16f16(<16 x half> %x, <16 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmaxph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5f,0xd0]
-; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v16f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vmaxph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5f,0xd0]
+; HasVL-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v16f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vextracti128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x39,0xc2,0x01]
+; NOVL-NEXT: vpsrldq $14, %xmm2, %xmm4 # encoding: [0xc5,0xd9,0x73,0xda,0x0e]
+; NOVL-NEXT: # xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vextracti128 $1, %ymm1, %xmm3 # encoding: [0xc4,0xe3,0x7d,0x39,0xcb,0x01]
+; NOVL-NEXT: vpsrldq $14, %xmm3, %xmm5 # encoding: [0xc5,0xd1,0x73,0xdb,0x0e]
+; NOVL-NEXT: # xmm5 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm4, %xmm5, %xmm6 # encoding: [0x62,0xf5,0x56,0x08,0x5f,0xf4]
+; NOVL-NEXT: vcmpunordsh %xmm4, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcc,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf5]
+; NOVL-NEXT: vpshufd $255, %xmm2, %xmm4 # encoding: [0xc5,0xf9,0x70,0xe2,0xff]
+; NOVL-NEXT: # xmm4 = xmm2[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm3, %xmm5 # encoding: [0xc5,0xf9,0x70,0xeb,0xff]
+; NOVL-NEXT: # xmm5 = xmm3[3,3,3,3]
+; NOVL-NEXT: vmaxsh %xmm4, %xmm5, %xmm7 # encoding: [0x62,0xf5,0x56,0x08,0x5f,0xfc]
+; NOVL-NEXT: vcmpunordsh %xmm4, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcc,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfd]
+; NOVL-NEXT: vpunpcklwd %xmm6, %xmm7, %xmm4 # encoding: [0xc5,0xc1,0x61,0xe6]
+; NOVL-NEXT: # xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; NOVL-NEXT: vpsrldq $10, %xmm2, %xmm5 # encoding: [0xc5,0xd1,0x73,0xda,0x0a]
+; NOVL-NEXT: # xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm3, %xmm6 # encoding: [0xc5,0xc9,0x73,0xdb,0x0a]
+; NOVL-NEXT: # xmm6 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm5, %xmm6, %xmm7 # encoding: [0x62,0xf5,0x4e,0x08,0x5f,0xfd]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfe]
+; NOVL-NEXT: vshufpd $1, %xmm2, %xmm2, %xmm5 # encoding: [0xc5,0xe9,0xc6,0xea,0x01]
+; NOVL-NEXT: # xmm5 = xmm2[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm3, %xmm3, %xmm6 # encoding: [0xc5,0xe1,0xc6,0xf3,0x01]
+; NOVL-NEXT: # xmm6 = xmm3[1,0]
+; NOVL-NEXT: vmaxsh %xmm5, %xmm6, %xmm8 # encoding: [0x62,0x75,0x4e,0x08,0x5f,0xc5]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc6]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm8, %xmm5 # encoding: [0xc5,0xb9,0x61,0xef]
+; NOVL-NEXT: # xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; NOVL-NEXT: vpunpckldq %xmm4, %xmm5, %xmm4 # encoding: [0xc5,0xd1,0x62,0xe4]
+; NOVL-NEXT: # xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm5 # encoding: [0xc5,0xd1,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm5 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm6 # encoding: [0xc5,0xc9,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm6 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm5, %xmm6, %xmm7 # encoding: [0x62,0xf5,0x4e,0x08,0x5f,0xfd]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfe]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm5 # encoding: [0xc5,0xf8,0xc6,0xe8,0xff]
+; NOVL-NEXT: # xmm5 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm6 # encoding: [0xc5,0xf9,0x70,0xf1,0xff]
+; NOVL-NEXT: # xmm6 = xmm1[3,3,3,3]
+; NOVL-NEXT: vmaxsh %xmm5, %xmm6, %xmm8 # encoding: [0x62,0x75,0x4e,0x08,0x5f,0xc5]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc6]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm8, %xmm5 # encoding: [0xc5,0xb9,0x61,0xef]
+; NOVL-NEXT: # xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm6 # encoding: [0xc5,0xc9,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm6 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm7 # encoding: [0xc5,0xc1,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm7 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vmaxsh %xmm6, %xmm7, %xmm8 # encoding: [0x62,0x75,0x46,0x08,0x5f,0xc6]
+; NOVL-NEXT: vcmpunordsh %xmm6, %xmm6, %k1 # encoding: [0x62,0xf3,0x4e,0x08,0xc2,0xce,0x03]
+; NOVL-NEXT: vmovsh %xmm7, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc7]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm6 # encoding: [0xc5,0xf9,0xc6,0xf0,0x01]
+; NOVL-NEXT: # xmm6 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm7 # encoding: [0xc5,0xf1,0xc6,0xf9,0x01]
+; NOVL-NEXT: # xmm7 = xmm1[1,0]
+; NOVL-NEXT: vmaxsh %xmm6, %xmm7, %xmm9 # encoding: [0x62,0x75,0x46,0x08,0x5f,0xce]
+; NOVL-NEXT: vcmpunordsh %xmm6, %xmm6, %k1 # encoding: [0x62,0xf3,0x4e,0x08,0xc2,0xce,0x03]
+; NOVL-NEXT: vmovsh %xmm7, %xmm0, %xmm9 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xcf]
+; NOVL-NEXT: vpunpcklwd %xmm8, %xmm9, %xmm6 # encoding: [0xc4,0xc1,0x31,0x61,0xf0]
+; NOVL-NEXT: # xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; NOVL-NEXT: vpunpckldq %xmm5, %xmm6, %xmm5 # encoding: [0xc5,0xc9,0x62,0xed]
+; NOVL-NEXT: # xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; NOVL-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 # encoding: [0xc4,0xe3,0x55,0x38,0xe4,0x01]
+; NOVL-NEXT: vpsrlq $48, %xmm2, %xmm5 # encoding: [0xc5,0xd1,0x73,0xd2,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm3, %xmm6 # encoding: [0xc5,0xc9,0x73,0xd3,0x30]
+; NOVL-NEXT: vmaxsh %xmm5, %xmm6, %xmm7 # encoding: [0x62,0xf5,0x4e,0x08,0x5f,0xfd]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfe]
+; NOVL-NEXT: vmovshdup %xmm2, %xmm5 # encoding: [0xc5,0xfa,0x16,0xea]
+; NOVL-NEXT: # xmm5 = xmm2[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm3, %xmm6 # encoding: [0xc5,0xfa,0x16,0xf3]
+; NOVL-NEXT: # xmm6 = xmm3[1,1,3,3]
+; NOVL-NEXT: vmaxsh %xmm5, %xmm6, %xmm8 # encoding: [0x62,0x75,0x4e,0x08,0x5f,0xc5]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc6]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm8, %xmm5 # encoding: [0xc5,0xb9,0x61,0xef]
+; NOVL-NEXT: # xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm6 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xf2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf3]
+; NOVL-NEXT: vpsrld $16, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x72,0xd2,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x72,0xd3,0x10]
+; NOVL-NEXT: vmaxsh %xmm2, %xmm3, %xmm7 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xfa]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfb]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm6, %xmm2 # encoding: [0xc5,0xc9,0x61,0xd7]
+; NOVL-NEXT: # xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; NOVL-NEXT: vpunpckldq %xmm5, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x62,0xd5]
+; NOVL-NEXT: # xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm5 # encoding: [0xc5,0xd1,0x73,0xd1,0x30]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm5, %xmm6 # encoding: [0x62,0xf5,0x56,0x08,0x5f,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf5]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm5 # encoding: [0xc5,0xfa,0x16,0xe9]
+; NOVL-NEXT: # xmm5 = xmm1[1,1,3,3]
+; NOVL-NEXT: vmaxsh %xmm3, %xmm5, %xmm7 # encoding: [0x62,0xf5,0x56,0x08,0x5f,0xfb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfd]
+; NOVL-NEXT: vpunpcklwd %xmm6, %xmm7, %xmm3 # encoding: [0xc5,0xc1,0x61,0xde]
+; NOVL-NEXT: # xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vmaxsh %xmm0, %xmm1, %xmm6 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xf0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf1]
+; NOVL-NEXT: vpunpcklwd %xmm6, %xmm5, %xmm0 # encoding: [0xc5,0xd1,0x61,0xc6]
+; NOVL-NEXT: # xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
+; NOVL-NEXT: vpunpcklqdq %ymm4, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x6c,0xc4]
+; NOVL-NEXT: # ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %x, <16 x half> %y) readnone
ret <16 x half> %z
}
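
Without AVX512VL, vmaxph is only available at 512 bits, so this ymm test is split with vextracti128 into 128-bit halves, each half is fully scalarized as above, and the results are reassembled via vinserti128 and vpunpcklqdq. In IR terms the split corresponds to something like the following hypothetical sketch (names and structure are illustrative, not the backend's actual legalization steps):

declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)

define <16 x half> @maxnum_v16f16_split_sketch(<16 x half> %x, <16 x half> %y) {
  ; Extract the low and high 128-bit halves of each operand.
  %xlo = shufflevector <16 x half> %x, <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ylo = shufflevector <16 x half> %y, <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %xhi = shufflevector <16 x half> %x, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %yhi = shufflevector <16 x half> %y, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Operate on each half, then concatenate the two results.
  %lo = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %xlo, <8 x half> %ylo)
  %hi = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %xhi, <8 x half> %yhi)
  %r  = shufflevector <8 x half> %lo, <8 x half> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x half> %r
}
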
@@ -81,10 +418,68 @@ define <32 x half> @test_intrinsic_fmax_v32f16(<32 x half> %x, <32 x half> %y) {
}
define <4 x half> @maxnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) {
-; CHECK-LABEL: maxnum_intrinsic_nnan_fmf_f432:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5f,0xc1]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: maxnum_intrinsic_nnan_fmf_f432:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vmaxph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5f,0xc1]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: maxnum_intrinsic_nnan_fmf_f432:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vcmpltsh %xmm2, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xca,0x01]
+; NOVL-NEXT: vmovsh %xmm2, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xda]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm4 # encoding: [0xc5,0xf9,0x70,0xe1,0xff]
+; NOVL-NEXT: # xmm4 = xmm1[3,3,3,3]
+; NOVL-NEXT: vcmpltsh %xmm2, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xca,0x01]
+; NOVL-NEXT: vmovsh %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe2]
+; NOVL-NEXT: vpunpcklwd %xmm3, %xmm4, %xmm2 # encoding: [0xc5,0xd9,0x61,0xd3]
+; NOVL-NEXT: # xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm5 # encoding: [0xc5,0xf1,0xc6,0xe9,0x01]
+; NOVL-NEXT: # xmm5 = xmm1[1,0]
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm3 # encoding: [0xc5,0xd1,0x61,0xdc]
+; NOVL-NEXT: # xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm5 # encoding: [0xc5,0xfa,0x16,0xe9]
+; NOVL-NEXT: # xmm5 = xmm1[1,1,3,3]
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm3 # encoding: [0xc5,0xd1,0x61,0xdc]
+; NOVL-NEXT: # xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vcmpltsh %xmm0, %xmm1, %k1 # encoding: [0x62,0xf3,0x76,0x08,0xc2,0xc8,0x01]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x72,0xd1,0x10]
+; NOVL-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc8]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vcmpltsh %xmm0, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xc8,0x01]
+; NOVL-NEXT: vmovsh %xmm0, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe0]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x61,0xc4]
+; NOVL-NEXT: # xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%r = tail call nnan <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %r
}
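
Note the effect of the nnan flag in the function above: with NaNs assumed absent, the NOVL lowering drops the vcmpunordsh guard entirely, and each element becomes a single vcmpltsh plus masked vmovsh, i.e. a plain ordered compare and select. A hypothetical IR sketch of the simplified per-element semantics:

; Under nnan the NaN-propagation logic is dead (a NaN input would make
; the result poison), so maxnum degenerates to compare-and-select.
define half @maxnum_nnan_sketch(half %x, half %y) {
  %lt = fcmp olt half %x, %y        ; mirrors vcmpltsh
  %r  = select i1 %lt, half %y, half %x
  ret half %r
}
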
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll
index b81a6d5..05509e0 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,HasVL
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,NOVL
declare half @llvm.minnum.f16(half, half)
declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
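
The fminnum.ll updates below mirror the fmaxnum coverage one-for-one: the same HasVL/NOVL RUN split, with vminsh/vminph substituted for vmaxsh/vmaxph under the identical vcmpunordsh-plus-masked-move guard, since llvm.minnum shares llvm.maxnum's contract of returning the other operand when one input is NaN. The sketches given for fmaxnum apply here with the compare direction reversed.
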
@@ -9,61 +10,397 @@ declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
define half @test_intrinsic_fminh(half %x, half %y) {
-; CHECK-LABEL: test_intrinsic_fminh:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xd0]
-; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fminh:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vminsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xd0]
+; HasVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; HasVL-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fminh:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xd0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; NOVL-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call half @llvm.minnum.f16(half %x, half %y) readnone
ret half %z
}
define <2 x half> @test_intrinsic_fmin_v2f16(<2 x half> %x, <2 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmin_v2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0]
-; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmin_v2f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0]
+; HasVL-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmin_v2f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xe2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xea]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT: # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT: # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <2 x half> @llvm.minnum.v2f16(<2 x half> %x, <2 x half> %y) readnone
ret <2 x half> %z
}
define <4 x half> @test_intrinsic_fmin_v4f16(<4 x half> %x, <4 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmin_v4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0]
-; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmin_v4f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0]
+; HasVL-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmin_v4f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xe2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xea]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT: # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT: # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <4 x half> @llvm.minnum.v4f16(<4 x half> %x, <4 x half> %y) readnone
ret <4 x half> %z
}
define <8 x half> @test_intrinsic_fmin_v8f16(<8 x half> %x, <8 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmin_v8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0]
-; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmin_v8f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0]
+; HasVL-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmin_v8f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xe2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT: # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xea]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT: # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT: # xmm3 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xeb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vminsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT: # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT: # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <8 x half> @llvm.minnum.v8f16(<8 x half> %x, <8 x half> %y) readnone
ret <8 x half> %z
}
define <16 x half> @test_intrinsic_fmin_v16f16(<16 x half> %x, <16 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmin_v16f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vminph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5d,0xd0]
-; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03]
-; CHECK-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1]
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmin_v16f16:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vminph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5d,0xd0]
+; HasVL-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03]
+; HasVL-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1]
+; HasVL-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmin_v16f16:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vextracti128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x39,0xc2,0x01]
+; NOVL-NEXT: vpsrldq $14, %xmm2, %xmm4 # encoding: [0xc5,0xd9,0x73,0xda,0x0e]
+; NOVL-NEXT: # xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vextracti128 $1, %ymm1, %xmm3 # encoding: [0xc4,0xe3,0x7d,0x39,0xcb,0x01]
+; NOVL-NEXT: vpsrldq $14, %xmm3, %xmm5 # encoding: [0xc5,0xd1,0x73,0xdb,0x0e]
+; NOVL-NEXT: # xmm5 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm4, %xmm5, %xmm6 # encoding: [0x62,0xf5,0x56,0x08,0x5d,0xf4]
+; NOVL-NEXT: vcmpunordsh %xmm4, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcc,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf5]
+; NOVL-NEXT: vpshufd $255, %xmm2, %xmm4 # encoding: [0xc5,0xf9,0x70,0xe2,0xff]
+; NOVL-NEXT: # xmm4 = xmm2[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm3, %xmm5 # encoding: [0xc5,0xf9,0x70,0xeb,0xff]
+; NOVL-NEXT: # xmm5 = xmm3[3,3,3,3]
+; NOVL-NEXT: vminsh %xmm4, %xmm5, %xmm7 # encoding: [0x62,0xf5,0x56,0x08,0x5d,0xfc]
+; NOVL-NEXT: vcmpunordsh %xmm4, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcc,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfd]
+; NOVL-NEXT: vpunpcklwd %xmm6, %xmm7, %xmm4 # encoding: [0xc5,0xc1,0x61,0xe6]
+; NOVL-NEXT: # xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; NOVL-NEXT: vpsrldq $10, %xmm2, %xmm5 # encoding: [0xc5,0xd1,0x73,0xda,0x0a]
+; NOVL-NEXT: # xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm3, %xmm6 # encoding: [0xc5,0xc9,0x73,0xdb,0x0a]
+; NOVL-NEXT: # xmm6 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm5, %xmm6, %xmm7 # encoding: [0x62,0xf5,0x4e,0x08,0x5d,0xfd]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfe]
+; NOVL-NEXT: vshufpd $1, %xmm2, %xmm2, %xmm5 # encoding: [0xc5,0xe9,0xc6,0xea,0x01]
+; NOVL-NEXT: # xmm5 = xmm2[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm3, %xmm3, %xmm6 # encoding: [0xc5,0xe1,0xc6,0xf3,0x01]
+; NOVL-NEXT: # xmm6 = xmm3[1,0]
+; NOVL-NEXT: vminsh %xmm5, %xmm6, %xmm8 # encoding: [0x62,0x75,0x4e,0x08,0x5d,0xc5]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc6]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm8, %xmm5 # encoding: [0xc5,0xb9,0x61,0xef]
+; NOVL-NEXT: # xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; NOVL-NEXT: vpunpckldq %xmm4, %xmm5, %xmm4 # encoding: [0xc5,0xd1,0x62,0xe4]
+; NOVL-NEXT: # xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm5 # encoding: [0xc5,0xd1,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm5 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm6 # encoding: [0xc5,0xc9,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm6 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm5, %xmm6, %xmm7 # encoding: [0x62,0xf5,0x4e,0x08,0x5d,0xfd]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfe]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm5 # encoding: [0xc5,0xf8,0xc6,0xe8,0xff]
+; NOVL-NEXT: # xmm5 = xmm0[3,3,3,3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm6 # encoding: [0xc5,0xf9,0x70,0xf1,0xff]
+; NOVL-NEXT: # xmm6 = xmm1[3,3,3,3]
+; NOVL-NEXT: vminsh %xmm5, %xmm6, %xmm8 # encoding: [0x62,0x75,0x4e,0x08,0x5d,0xc5]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc6]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm8, %xmm5 # encoding: [0xc5,0xb9,0x61,0xef]
+; NOVL-NEXT: # xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm6 # encoding: [0xc5,0xc9,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm6 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm7 # encoding: [0xc5,0xc1,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm7 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vminsh %xmm6, %xmm7, %xmm8 # encoding: [0x62,0x75,0x46,0x08,0x5d,0xc6]
+; NOVL-NEXT: vcmpunordsh %xmm6, %xmm6, %k1 # encoding: [0x62,0xf3,0x4e,0x08,0xc2,0xce,0x03]
+; NOVL-NEXT: vmovsh %xmm7, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc7]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm6 # encoding: [0xc5,0xf9,0xc6,0xf0,0x01]
+; NOVL-NEXT: # xmm6 = xmm0[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm7 # encoding: [0xc5,0xf1,0xc6,0xf9,0x01]
+; NOVL-NEXT: # xmm7 = xmm1[1,0]
+; NOVL-NEXT: vminsh %xmm6, %xmm7, %xmm9 # encoding: [0x62,0x75,0x46,0x08,0x5d,0xce]
+; NOVL-NEXT: vcmpunordsh %xmm6, %xmm6, %k1 # encoding: [0x62,0xf3,0x4e,0x08,0xc2,0xce,0x03]
+; NOVL-NEXT: vmovsh %xmm7, %xmm0, %xmm9 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xcf]
+; NOVL-NEXT: vpunpcklwd %xmm8, %xmm9, %xmm6 # encoding: [0xc4,0xc1,0x31,0x61,0xf0]
+; NOVL-NEXT: # xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; NOVL-NEXT: vpunpckldq %xmm5, %xmm6, %xmm5 # encoding: [0xc5,0xc9,0x62,0xed]
+; NOVL-NEXT: # xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; NOVL-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 # encoding: [0xc4,0xe3,0x55,0x38,0xe4,0x01]
+; NOVL-NEXT: vpsrlq $48, %xmm2, %xmm5 # encoding: [0xc5,0xd1,0x73,0xd2,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm3, %xmm6 # encoding: [0xc5,0xc9,0x73,0xd3,0x30]
+; NOVL-NEXT: vminsh %xmm5, %xmm6, %xmm7 # encoding: [0x62,0xf5,0x4e,0x08,0x5d,0xfd]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfe]
+; NOVL-NEXT: vmovshdup %xmm2, %xmm5 # encoding: [0xc5,0xfa,0x16,0xea]
+; NOVL-NEXT: # xmm5 = xmm2[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm3, %xmm6 # encoding: [0xc5,0xfa,0x16,0xf3]
+; NOVL-NEXT: # xmm6 = xmm3[1,1,3,3]
+; NOVL-NEXT: vminsh %xmm5, %xmm6, %xmm8 # encoding: [0x62,0x75,0x4e,0x08,0x5d,0xc5]
+; NOVL-NEXT: vcmpunordsh %xmm5, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcd,0x03]
+; NOVL-NEXT: vmovsh %xmm6, %xmm0, %xmm8 {%k1} # encoding: [0x62,0x75,0x7e,0x09,0x10,0xc6]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm8, %xmm5 # encoding: [0xc5,0xb9,0x61,0xef]
+; NOVL-NEXT: # xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm6 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xf2]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf3]
+; NOVL-NEXT: vpsrld $16, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x72,0xd2,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x72,0xd3,0x10]
+; NOVL-NEXT: vminsh %xmm2, %xmm3, %xmm7 # encoding: [0x62,0xf5,0x66,0x08,0x5d,0xfa]
+; NOVL-NEXT: vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfb]
+; NOVL-NEXT: vpunpcklwd %xmm7, %xmm6, %xmm2 # encoding: [0xc5,0xc9,0x61,0xd7]
+; NOVL-NEXT: # xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; NOVL-NEXT: vpunpckldq %xmm5, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x62,0xd5]
+; NOVL-NEXT: # xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm5 # encoding: [0xc5,0xd1,0x73,0xd1,0x30]
+; NOVL-NEXT: vminsh %xmm3, %xmm5, %xmm6 # encoding: [0x62,0xf5,0x56,0x08,0x5d,0xf3]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf5]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT: # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm5 # encoding: [0xc5,0xfa,0x16,0xe9]
+; NOVL-NEXT: # xmm5 = xmm1[1,1,3,3]
+; NOVL-NEXT: vminsh %xmm3, %xmm5, %xmm7 # encoding: [0x62,0xf5,0x56,0x08,0x5d,0xfb]
+; NOVL-NEXT: vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm7 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xfd]
+; NOVL-NEXT: vpunpcklwd %xmm6, %xmm7, %xmm3 # encoding: [0xc5,0xc1,0x61,0xde]
+; NOVL-NEXT: # xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xe8]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT: vminsh %xmm0, %xmm1, %xmm6 # encoding: [0x62,0xf5,0x76,0x08,0x5d,0xf0]
+; NOVL-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT: vmovsh %xmm1, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf1]
+; NOVL-NEXT: vpunpcklwd %xmm6, %xmm5, %xmm0 # encoding: [0xc5,0xd1,0x61,0xc6]
+; NOVL-NEXT: # xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
+; NOVL-NEXT: vpunpcklqdq %ymm4, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x6c,0xc4]
+; NOVL-NEXT: # ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
+; NOVL-NEXT: retq # encoding: [0xc3]
%z = call <16 x half> @llvm.minnum.v16f16(<16 x half> %x, <16 x half> %y) readnone
ret <16 x half> %z
}
@@ -81,10 +418,68 @@ define <32 x half> @test_intrinsic_fmin_v32f16(<32 x half> %x, <32 x half> %y) {
}
define <4 x half> @minnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) {
-; CHECK-LABEL: minnum_intrinsic_nnan_fmf_f432:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5d,0xc1]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; HasVL-LABEL: minnum_intrinsic_nnan_fmf_f432:
+; HasVL: # %bb.0:
+; HasVL-NEXT: vminph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5d,0xc1]
+; HasVL-NEXT: retq # encoding: [0xc3]
+;
+; NOVL-LABEL: minnum_intrinsic_nnan_fmf_f432:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpsrldq $14, %xmm1, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd9,0x0e]
+; NOVL-NEXT: # xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $14, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0e]
+; NOVL-NEXT: # xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vcmpltsh %xmm2, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xca,0x01]
+; NOVL-NEXT: vmovsh %xmm3, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd3]
+; NOVL-NEXT: vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT: # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT: vshufps $255, %xmm0, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0xc6,0xe0,0xff]
+; NOVL-NEXT: # xmm4 = xmm0[3,3,3,3]
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xdc]
+; NOVL-NEXT: vpunpcklwd %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x61,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; NOVL-NEXT: vpsrldq $10, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0a]
+; NOVL-NEXT: # xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vpsrldq $10, %xmm0, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd8,0x0a]
+; NOVL-NEXT: # xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xdc]
+; NOVL-NEXT: vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT: # xmm4 = xmm1[1,0]
+; NOVL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm5 # encoding: [0xc5,0xf9,0xc6,0xe8,0x01]
+; NOVL-NEXT: # xmm5 = xmm0[1,0]
+; NOVL-NEXT: vcmpltsh %xmm4, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcc,0x01]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe5]
+; NOVL-NEXT: vpunpcklwd %xmm3, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x61,0xdb]
+; NOVL-NEXT: # xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; NOVL-NEXT: vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT: vpsrlq $48, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd1,0x30]
+; NOVL-NEXT: vpsrlq $48, %xmm0, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd0,0x30]
+; NOVL-NEXT: vcmpltsh %xmm3, %xmm4, %k1 # encoding: [0x62,0xf3,0x5e,0x08,0xc2,0xcb,0x01]
+; NOVL-NEXT: vmovsh %xmm4, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xdc]
+; NOVL-NEXT: vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT: # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT: vmovshdup %xmm0, %xmm5 # encoding: [0xc5,0xfa,0x16,0xe8]
+; NOVL-NEXT: # xmm5 = xmm0[1,1,3,3]
+; NOVL-NEXT: vcmpltsh %xmm4, %xmm5, %k1 # encoding: [0x62,0xf3,0x56,0x08,0xc2,0xcc,0x01]
+; NOVL-NEXT: vmovsh %xmm5, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe5]
+; NOVL-NEXT: vpunpcklwd %xmm3, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x61,0xdb]
+; NOVL-NEXT: # xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; NOVL-NEXT: vcmpltsh %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc9,0x01]
+; NOVL-NEXT: vpsrld $16, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x72,0xd1,0x10]
+; NOVL-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc8]
+; NOVL-NEXT: vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT: vcmpltsh %xmm4, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xcc,0x01]
+; NOVL-NEXT: vmovsh %xmm0, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe0]
+; NOVL-NEXT: vpunpcklwd %xmm4, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x61,0xc4]
+; NOVL-NEXT: # xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; NOVL-NEXT: vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT: # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT: retq # encoding: [0xc3]
%r = tail call nnan <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
ret <4 x half> %r
}
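In the nnan test above, the fast-math flag lets the backend assume no NaN inputs, so the unordered patch-up disappears: each lane is just a vcmpltsh feeding a masked vmovsh, i.e. a plain compare-and-select. Roughly, in C:

    /* One lane under nnan: a bare compare/select, no NaN handling. */
    static float minnum_nnan_lane(float x, float y) {
        return (x < y) ? x : y;
    }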
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 2922113..4fc0827 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
; PR46472
; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
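The rewritten X64-NOBMI checks below match the xor form of this identity: bitselect(a,b,m) is equally or(and(a,not(m)), and(b,m)) and a ^ ((a ^ b) & m); the latter takes three ALU ops and no spare register for not(m), while the BMI runs keep the or form but fold the not+and pair into a single andn. A small C sketch of the two equivalent forms:

    #include <stdint.h>

    /* Form from the comment above: or(and(a,not(m)), and(b,m)).
       The BMI runs lower the (a & ~m) half with a single andn. */
    static uint8_t bitselect_or(uint8_t a, uint8_t b, uint8_t m) {
        return (uint8_t)((a & ~m) | (b & m));
    }

    /* Xor form behind the new X64-NOBMI checks: where a mask bit is 1
       the inner xor flips a to b, where it is 0 a passes through. */
    static uint8_t bitselect_xor(uint8_t a, uint8_t b, uint8_t m) {
        return (uint8_t)(a ^ ((a ^ b) & m));
    }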
@@ -17,14 +17,22 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
; X86-NEXT: xorb %cl, %al
; X86-NEXT: retl
;
-; X64-LABEL: bitselect_i8:
-; X64: # %bb.0:
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: movl %edx, %eax
-; X64-NEXT: notb %al
-; X64-NEXT: andb %dil, %al
-; X64-NEXT: orb %sil, %al
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: bitselect_i8:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %esi, %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: andl %edx, %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: # kill: def $al killed $al killed $eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: bitselect_i8:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: andnl %edi, %edx, %eax
+; X64-BMI-NEXT: andl %edx, %esi
+; X64-BMI-NEXT: orl %esi, %eax
+; X64-BMI-NEXT: # kill: def $al killed $al killed $eax
+; X64-BMI-NEXT: retq
%not = xor i8 %m, -1
%ma = and i8 %a, %not
%mb = and i8 %b, %m
@@ -35,21 +43,20 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind {
; X86-LABEL: bitselect_i16:
; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: xorw %ax, %cx
-; X86-NEXT: andw {{[0-9]+}}(%esp), %cx
+; X86-NEXT: xorw %cx, %ax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-NOBMI-LABEL: bitselect_i16:
; X64-NOBMI: # %bb.0:
-; X64-NOBMI-NEXT: movl %edx, %eax
-; X64-NOBMI-NEXT: andl %edx, %esi
-; X64-NOBMI-NEXT: notl %eax
-; X64-NOBMI-NEXT: andl %edi, %eax
-; X64-NOBMI-NEXT: orl %esi, %eax
+; X64-NOBMI-NEXT: movl %esi, %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: andl %edx, %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NOBMI-NEXT: retq
;
@@ -186,13 +193,12 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
;
; X64-BMI-LABEL: bitselect_i128:
; X64-BMI: # %bb.0:
-; X64-BMI-NEXT: andnq %rsi, %r9, %rsi
; X64-BMI-NEXT: andnq %rdi, %r8, %rax
-; X64-BMI-NEXT: andq %r9, %rcx
-; X64-BMI-NEXT: orq %rcx, %rsi
; X64-BMI-NEXT: andq %r8, %rdx
; X64-BMI-NEXT: orq %rdx, %rax
-; X64-BMI-NEXT: movq %rsi, %rdx
+; X64-BMI-NEXT: andnq %rsi, %r9, %rdx
+; X64-BMI-NEXT: andq %r9, %rcx
+; X64-BMI-NEXT: orq %rcx, %rdx
; X64-BMI-NEXT: retq
%not = xor i128 %m, -1
%ma = and i128 %a, %not
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index b2614c5..4a4eecb 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -30,18 +30,17 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
; NOBMI-LABEL: masked_merge1:
; NOBMI: # %bb.0:
-; NOBMI-NEXT: movl %edi, %eax
-; NOBMI-NEXT: andl %edi, %esi
-; NOBMI-NEXT: notl %eax
-; NOBMI-NEXT: andl %edx, %eax
-; NOBMI-NEXT: orl %esi, %eax
+; NOBMI-NEXT: movl %esi, %eax
+; NOBMI-NEXT: xorl %edx, %eax
+; NOBMI-NEXT: andl %edi, %eax
+; NOBMI-NEXT: xorl %edx, %eax
; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
; NOBMI-NEXT: retq
;
; BMI-LABEL: masked_merge1:
; BMI: # %bb.0:
-; BMI-NEXT: andl %edi, %esi
; BMI-NEXT: andnl %edx, %edi, %eax
+; BMI-NEXT: andl %edi, %esi
; BMI-NEXT: orl %esi, %eax
; BMI-NEXT: # kill: def $ax killed $ax killed $eax
; BMI-NEXT: retq
@@ -53,20 +52,11 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
}
define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; NOBMI-LABEL: masked_merge2:
-; NOBMI: # %bb.0:
-; NOBMI-NEXT: movl %esi, %eax
-; NOBMI-NEXT: # kill: def $al killed $al killed $eax
-; NOBMI-NEXT: retq
-;
-; BMI-LABEL: masked_merge2:
-; BMI: # %bb.0:
-; BMI-NEXT: movl %edi, %eax
-; BMI-NEXT: notb %al
-; BMI-NEXT: andb %sil, %al
-; BMI-NEXT: andb %dil, %sil
-; BMI-NEXT: orb %sil, %al
-; BMI-NEXT: retq
+; CHECK-LABEL: masked_merge2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
%not = xor i8 %a0, -1
%and0 = and i8 %not, %a1
%and1 = and i8 %a1, %a0
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 3ac4415..a7eea04 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -3443,8 +3443,6 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll __unordtf2
; X86-NEXT: addl $32, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
; X86-NEXT: orb %bl, %al
; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
@@ -3526,8 +3524,6 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp)
; WIN-X86-NEXT: calll ___unordtf2
; WIN-X86-NEXT: addl $32, %esp
-; WIN-X86-NEXT: testl %eax, %eax
-; WIN-X86-NEXT: setne %al
; WIN-X86-NEXT: orb %bl, %al
; WIN-X86-NEXT: jne LBB39_1
; WIN-X86-NEXT: # %bb.2:
diff --git a/llvm/test/CodeGen/X86/fpcmp-soft-fp.ll b/llvm/test/CodeGen/X86/fpcmp-soft-fp.ll
index e89acc6..480a47e 100644
--- a/llvm/test/CodeGen/X86/fpcmp-soft-fp.ll
+++ b/llvm/test/CodeGen/X86/fpcmp-soft-fp.ll
@@ -99,8 +99,6 @@ entry:
; CHECK: calll __eqdf2
; CHECK: sete
; CHECK: calll __unorddf2
-; CHECK: setne
-; CHECK: or
; CHECK: retl
define i1 @test11(double %d) #0 {
diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll
index f9b2ce7..65cd1ed 100644
--- a/llvm/test/CodeGen/X86/musttail-varargs.ll
+++ b/llvm/test/CodeGen/X86/musttail-varargs.ll
@@ -2,6 +2,7 @@
; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32
; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-uefi | FileCheck %s --check-prefix=WINDOWS
; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
diff --git a/llvm/test/CodeGen/X86/pr142937.ll b/llvm/test/CodeGen/X86/pr142937.ll
new file mode 100644
index 0000000..8be99e1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr142937.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-- -O0 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- -O0 | FileCheck %s --check-prefix=X64
+
+define void @public_type_test() {
+; X86-LABEL: public_type_test:
+; X86: # %bb.0: # %bb
+; X86-NEXT: movb $1, %al
+; X86-NEXT: # %bb.1: # %bb1
+; X86-NEXT: retl
+;
+; X64-LABEL: public_type_test:
+; X64: # %bb.0: # %bb
+; X64-NEXT: movb $1, %al
+; X64-NEXT: # %bb.1: # %bb1
+; X64-NEXT: retq
+bb:
+ %call = call i1 @llvm.public.type.test(ptr null, metadata !"typeinfo")
+ br label %bb1
+
+bb1:
+ call void @llvm.assume(i1 %call)
+ ret void
+}
+
+define void @type_test() {
+; X86-LABEL: type_test:
+; X86: # %bb.0: # %bb
+; X86-NEXT: movb $1, %al
+; X86-NEXT: testb $1, %al
+; X86-NEXT: jne .LBB1_2
+; X86-NEXT: # %bb.1: # %bb1
+; X86-NEXT: ud1l 2(%eax), %eax
+; X86-NEXT: .LBB1_2: # %bb2
+; X86-NEXT: retl
+;
+; X64-LABEL: type_test:
+; X64: # %bb.0: # %bb
+; X64-NEXT: movb $1, %al
+; X64-NEXT: testb $1, %al
+; X64-NEXT: jne .LBB1_2
+; X64-NEXT: # %bb.1: # %bb1
+; X64-NEXT: ud1l 2(%eax), %eax
+; X64-NEXT: .LBB1_2: # %bb2
+; X64-NEXT: retq
+bb:
+ %call = tail call i1 @llvm.type.test(ptr null, metadata !"typeinfo")
+ br i1 %call, label %bb2, label %bb1
+
+bb1:
+ tail call void @llvm.ubsantrap(i8 2)
+ unreachable
+
+bb2:
+ ret void
+}
+
+declare i1 @llvm.public.type.test(ptr, metadata)
+
+declare void @llvm.assume(i1 noundef)
+
+declare i1 @llvm.type.test(ptr, metadata)
+
+declare void @llvm.ubsantrap(i8 immarg)
diff --git a/llvm/test/CodeGen/X86/rex-profile-test.ll b/llvm/test/CodeGen/X86/rex-profile-test.ll
new file mode 100644
index 0000000..379d8fa
--- /dev/null
+++ b/llvm/test/CodeGen/X86/rex-profile-test.ll
@@ -0,0 +1,18 @@
+;; Test that the UEFI and Windows targets emit the rex64 prefix correctly.
+; RUN: llc -mtriple x86_64-uefi %s -o - | FileCheck %s -check-prefix=REX
+; RUN: llc -mtriple x86_64-windows-msvc %s -o - | FileCheck %s -check-prefix=REX
+; RUN: llc -mtriple x86_64-unknown-linux %s -o - | FileCheck %s -check-prefix=NOREX
+
+define void @test_tailjmp(ptr %fptr) {
+; REX-LABEL: test_tailjmp: # @test_tailjmp
+; REX: # %bb.0: # %entry
+; REX-NEXT: rex64 jmpq *%rcx # TAILCALL
+;
+; NOREX-LABEL: test_tailjmp: # @test_tailjmp
+; NOREX: .cfi_startproc
+; NOREX-NEXT: # %bb.0: # %entry
+; NOREX-NEXT: jmpq *%rdi # TAILCALL
+entry:
+ tail call void %fptr()
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 9c9d069..6a55d74 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,21 +6,18 @@
define i8 @out8(i8 %x, i8 %y, i8 %mask) {
; CHECK-NOBMI-LABEL: out8:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %edx, %eax
-; CHECK-NOBMI-NEXT: andl %edx, %edi
-; CHECK-NOBMI-NEXT: notb %al
-; CHECK-NOBMI-NEXT: andb %sil, %al
-; CHECK-NOBMI-NEXT: orb %dil, %al
+; CHECK-NOBMI-NEXT: movl %edi, %eax
+; CHECK-NOBMI-NEXT: xorl %esi, %eax
+; CHECK-NOBMI-NEXT: andl %edx, %eax
+; CHECK-NOBMI-NEXT: xorl %esi, %eax
; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: out8:
; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movl %edx, %eax
+; CHECK-BMI-NEXT: andnl %esi, %edx, %eax
; CHECK-BMI-NEXT: andl %edx, %edi
-; CHECK-BMI-NEXT: notb %al
-; CHECK-BMI-NEXT: andb %sil, %al
-; CHECK-BMI-NEXT: orb %dil, %al
+; CHECK-BMI-NEXT: orl %edi, %eax
; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax
; CHECK-BMI-NEXT: retq
%mx = and i8 %x, %mask
@@ -33,18 +30,17 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
define i16 @out16(i16 %x, i16 %y, i16 %mask) {
; CHECK-NOBMI-LABEL: out16:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %edx, %eax
-; CHECK-NOBMI-NEXT: andl %edx, %edi
-; CHECK-NOBMI-NEXT: notl %eax
-; CHECK-NOBMI-NEXT: andl %esi, %eax
-; CHECK-NOBMI-NEXT: orl %edi, %eax
+; CHECK-NOBMI-NEXT: movl %edi, %eax
+; CHECK-NOBMI-NEXT: xorl %esi, %eax
+; CHECK-NOBMI-NEXT: andl %edx, %eax
+; CHECK-NOBMI-NEXT: xorl %esi, %eax
; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: out16:
; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: andl %edx, %edi
; CHECK-BMI-NEXT: andnl %esi, %edx, %eax
+; CHECK-BMI-NEXT: andl %edx, %edi
; CHECK-BMI-NEXT: orl %edi, %eax
; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-BMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194be..809c158 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,11 +16,10 @@
define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
; CHECK-LABEL: out_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: andl %edx, %edi
-; CHECK-NEXT: notb %al
-; CHECK-NEXT: andb %sil, %al
-; CHECK-NEXT: orb %dil, %al
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %edx, %eax
+; CHECK-NEXT: xorl %esi, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%mx = and <1 x i8> %x, %mask
@@ -37,32 +36,28 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v2i8:
; CHECK-BASELINE: # %bb.0:
-; CHECK-BASELINE-NEXT: movl %r8d, %eax
+; CHECK-BASELINE-NEXT: movl %edi, %eax
+; CHECK-BASELINE-NEXT: xorl %edx, %eax
+; CHECK-BASELINE-NEXT: andl %r8d, %eax
+; CHECK-BASELINE-NEXT: xorl %edx, %eax
+; CHECK-BASELINE-NEXT: xorl %ecx, %esi
; CHECK-BASELINE-NEXT: andl %r9d, %esi
-; CHECK-BASELINE-NEXT: andl %r8d, %edi
-; CHECK-BASELINE-NEXT: notb %al
-; CHECK-BASELINE-NEXT: notb %r9b
-; CHECK-BASELINE-NEXT: andb %cl, %r9b
-; CHECK-BASELINE-NEXT: andb %dl, %al
-; CHECK-BASELINE-NEXT: orb %dil, %al
-; CHECK-BASELINE-NEXT: orb %sil, %r9b
+; CHECK-BASELINE-NEXT: xorl %ecx, %esi
; CHECK-BASELINE-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT: movl %r9d, %edx
+; CHECK-BASELINE-NEXT: movl %esi, %edx
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v2i8:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movl %r8d, %eax
+; CHECK-SSE1-NEXT: movl %edi, %eax
+; CHECK-SSE1-NEXT: xorl %edx, %eax
+; CHECK-SSE1-NEXT: andl %r8d, %eax
+; CHECK-SSE1-NEXT: xorl %edx, %eax
+; CHECK-SSE1-NEXT: xorl %ecx, %esi
; CHECK-SSE1-NEXT: andl %r9d, %esi
-; CHECK-SSE1-NEXT: andl %r8d, %edi
-; CHECK-SSE1-NEXT: notb %al
-; CHECK-SSE1-NEXT: notb %r9b
-; CHECK-SSE1-NEXT: andb %cl, %r9b
-; CHECK-SSE1-NEXT: andb %dl, %al
-; CHECK-SSE1-NEXT: orb %dil, %al
-; CHECK-SSE1-NEXT: orb %sil, %r9b
+; CHECK-SSE1-NEXT: xorl %ecx, %esi
; CHECK-SSE1-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT: movl %r9d, %edx
+; CHECK-SSE1-NEXT: movl %esi, %edx
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: out_v2i8:
@@ -86,11 +81,10 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
; CHECK-LABEL: out_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: andl %edx, %edi
-; CHECK-NEXT: notl %eax
-; CHECK-NEXT: andl %esi, %eax
-; CHECK-NEXT: orl %edi, %eax
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %edx, %eax
+; CHECK-NEXT: xorl %esi, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%mx = and <1 x i16> %x, %mask
@@ -235,32 +229,28 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v2i16:
; CHECK-BASELINE: # %bb.0:
-; CHECK-BASELINE-NEXT: movl %r8d, %eax
+; CHECK-BASELINE-NEXT: movl %edi, %eax
+; CHECK-BASELINE-NEXT: xorl %edx, %eax
+; CHECK-BASELINE-NEXT: andl %r8d, %eax
+; CHECK-BASELINE-NEXT: xorl %edx, %eax
+; CHECK-BASELINE-NEXT: xorl %ecx, %esi
; CHECK-BASELINE-NEXT: andl %r9d, %esi
-; CHECK-BASELINE-NEXT: andl %r8d, %edi
-; CHECK-BASELINE-NEXT: notl %eax
-; CHECK-BASELINE-NEXT: notl %r9d
-; CHECK-BASELINE-NEXT: andl %ecx, %r9d
-; CHECK-BASELINE-NEXT: orl %esi, %r9d
-; CHECK-BASELINE-NEXT: andl %edx, %eax
-; CHECK-BASELINE-NEXT: orl %edi, %eax
+; CHECK-BASELINE-NEXT: xorl %ecx, %esi
; CHECK-BASELINE-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-BASELINE-NEXT: movl %r9d, %edx
+; CHECK-BASELINE-NEXT: movl %esi, %edx
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v2i16:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movl %r8d, %eax
+; CHECK-SSE1-NEXT: movl %edi, %eax
+; CHECK-SSE1-NEXT: xorl %edx, %eax
+; CHECK-SSE1-NEXT: andl %r8d, %eax
+; CHECK-SSE1-NEXT: xorl %edx, %eax
+; CHECK-SSE1-NEXT: xorl %ecx, %esi
; CHECK-SSE1-NEXT: andl %r9d, %esi
-; CHECK-SSE1-NEXT: andl %r8d, %edi
-; CHECK-SSE1-NEXT: notl %eax
-; CHECK-SSE1-NEXT: notl %r9d
-; CHECK-SSE1-NEXT: andl %ecx, %r9d
-; CHECK-SSE1-NEXT: orl %esi, %r9d
-; CHECK-SSE1-NEXT: andl %edx, %eax
-; CHECK-SSE1-NEXT: orl %edi, %eax
+; CHECK-SSE1-NEXT: xorl %ecx, %esi
; CHECK-SSE1-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-SSE1-NEXT: movl %r9d, %edx
+; CHECK-SSE1-NEXT: movl %esi, %edx
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: out_v2i16:
@@ -439,9 +429,12 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
; CHECK-BASELINE-LABEL: out_v4i16:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: xorl %r9d, %esi
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT: xorl %r9d, %esi
; CHECK-BASELINE-NEXT: xorl %r11d, %edx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT: xorl %r11d, %edx
@@ -451,21 +444,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT: xorl %edi, %r8d
-; CHECK-BASELINE-NEXT: xorl %r9d, %esi
-; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT: xorl %r9d, %esi
-; CHECK-BASELINE-NEXT: movw %si, (%rax)
; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT: movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT: movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT: movw %si, (%rax)
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v4i16:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: xorl %r9d, %esi
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT: xorl %r9d, %esi
; CHECK-SSE1-NEXT: xorl %r11d, %edx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT: xorl %r11d, %edx
@@ -475,13 +468,10 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT: xorl %edi, %r8d
-; CHECK-SSE1-NEXT: xorl %r9d, %esi
-; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT: xorl %r9d, %esi
-; CHECK-SSE1-NEXT: movw %si, (%rax)
; CHECK-SSE1-NEXT: movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT: movw %cx, 4(%rax)
; CHECK-SSE1-NEXT: movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT: movw %si, (%rax)
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: out_v4i16:
@@ -506,43 +496,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
; CHECK-BASELINE-LABEL: out_v4i16_undef:
; CHECK-BASELINE: # %bb.0:
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT: xorl %r9d, %esi
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT: xorl %r9d, %esi
; CHECK-BASELINE-NEXT: xorl %r10d, %edx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT: xorl %r10d, %edx
; CHECK-BASELINE-NEXT: xorl %edi, %r8d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT: xorl %edi, %r8d
-; CHECK-BASELINE-NEXT: xorl %r9d, %esi
-; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT: xorl %r9d, %esi
; CHECK-BASELINE-NEXT: movw %cx, 4(%rax)
-; CHECK-BASELINE-NEXT: movw %si, (%rax)
; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT: movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT: movw %si, (%rax)
; CHECK-BASELINE-NEXT: retq
;
; CHECK-SSE1-LABEL: out_v4i16_undef:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT: xorl %r9d, %esi
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT: xorl %r9d, %esi
; CHECK-SSE1-NEXT: xorl %r10d, %edx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT: xorl %r10d, %edx
; CHECK-SSE1-NEXT: xorl %edi, %r8d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT: xorl %edi, %r8d
-; CHECK-SSE1-NEXT: xorl %r9d, %esi
-; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT: xorl %r9d, %esi
; CHECK-SSE1-NEXT: movw %cx, 4(%rax)
-; CHECK-SSE1-NEXT: movw %si, (%rax)
; CHECK-SSE1-NEXT: movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT: movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT: movw %si, (%rax)
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -883,14 +873,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
; CHECK-BASELINE-NEXT: movq %rdi, %rax
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT: xorl %r12d, %esi
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT: xorl %r12d, %esi
@@ -906,16 +896,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
; CHECK-BASELINE-NEXT: xorl %ebx, %r9d
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w
; CHECK-BASELINE-NEXT: xorl %ebx, %r9d
-; CHECK-BASELINE-NEXT: movl %r11d, %ebx
-; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: xorw %r11w, %bx
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT: xorl %r11d, %ebx
-; CHECK-BASELINE-NEXT: movl %r10d, %r11d
-; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: xorw %r10w, %r11w
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w
; CHECK-BASELINE-NEXT: xorl %r10d, %r11d
-; CHECK-BASELINE-NEXT: movl %edi, %r10d
-; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: xorw %di, %r10w
; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w
; CHECK-BASELINE-NEXT: xorl %edi, %r10d
; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax)
@@ -941,14 +931,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT: xorl %r12d, %esi
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT: xorl %r12d, %esi
@@ -964,16 +954,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
; CHECK-SSE1-NEXT: xorl %ebx, %r9d
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w
; CHECK-SSE1-NEXT: xorl %ebx, %r9d
-; CHECK-SSE1-NEXT: movl %r11d, %ebx
-; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: xorw %r11w, %bx
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT: xorl %r11d, %ebx
-; CHECK-SSE1-NEXT: movl %r10d, %r11d
-; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: xorw %r10w, %r11w
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w
; CHECK-SSE1-NEXT: xorl %r10d, %r11d
-; CHECK-SSE1-NEXT: movl %edi, %r10d
-; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: xorw %di, %r10w
; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w
; CHECK-SSE1-NEXT: xorl %edi, %r10d
; CHECK-SSE1-NEXT: movw %r10w, 14(%rax)
@@ -1759,113 +1749,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: pushq %r13
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
-; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d
-; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d
-; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx
-; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d
-; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d
-; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d
-; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d
-; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r8w, %ax
-; CHECK-BASELINE-NEXT: andw (%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r8d
-; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r12w, %ax
-; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r12d
-; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r9w, %ax
-; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r9d
-; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r10w, %ax
-; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r10d
-; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r11w, %ax
-; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r11d
-; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r13w, %ax
-; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r13d
-; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax
+; CHECK-BASELINE-NEXT: movq %rcx, %r10
+; CHECK-BASELINE-NEXT: movq %rdx, %r8
+; CHECK-BASELINE-NEXT: movq %rsi, %r9
+; CHECK-BASELINE-NEXT: movq %rdi, %r11
+; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %ebp
+; CHECK-BASELINE-NEXT: movl 16(%rdx), %r15d
+; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r13d
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d
+; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r14d
+; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx
+; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %eax
+; CHECK-BASELINE-NEXT: movl (%rdx), %ecx
+; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx
+; CHECK-BASELINE-NEXT: movzwl 2(%r8), %esi
+; CHECK-BASELINE-NEXT: movzwl (%r9), %edi
+; CHECK-BASELINE-NEXT: xorw %cx, %di
+; CHECK-BASELINE-NEXT: andw (%r10), %di
+; CHECK-BASELINE-NEXT: xorl %ecx, %edi
+; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 2(%r9), %ecx
+; CHECK-BASELINE-NEXT: xorw %si, %cx
+; CHECK-BASELINE-NEXT: andw 2(%r10), %cx
+; CHECK-BASELINE-NEXT: xorl %esi, %ecx
+; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 4(%r9), %ecx
+; CHECK-BASELINE-NEXT: xorw %dx, %cx
+; CHECK-BASELINE-NEXT: andw 4(%r10), %cx
+; CHECK-BASELINE-NEXT: xorl %edx, %ecx
+; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 6(%r9), %ecx
+; CHECK-BASELINE-NEXT: xorw %ax, %cx
+; CHECK-BASELINE-NEXT: andw 6(%r10), %cx
+; CHECK-BASELINE-NEXT: xorl %eax, %ecx
+; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 8(%r9), %eax
; CHECK-BASELINE-NEXT: xorw %bx, %ax
-; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %ebx
-; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %bp, %ax
-; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax
+; CHECK-BASELINE-NEXT: andw 8(%r10), %ax
+; CHECK-BASELINE-NEXT: xorl %ebx, %eax
+; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movzwl 10(%r9), %ebx
+; CHECK-BASELINE-NEXT: xorw %r14w, %bx
+; CHECK-BASELINE-NEXT: andw 10(%r10), %bx
+; CHECK-BASELINE-NEXT: xorl %r14d, %ebx
+; CHECK-BASELINE-NEXT: movzwl 12(%r9), %r14d
+; CHECK-BASELINE-NEXT: xorw %r12w, %r14w
+; CHECK-BASELINE-NEXT: andw 12(%r10), %r14w
+; CHECK-BASELINE-NEXT: xorl %r12d, %r14d
+; CHECK-BASELINE-NEXT: movzwl 14(%r9), %r12d
+; CHECK-BASELINE-NEXT: xorw %r13w, %r12w
+; CHECK-BASELINE-NEXT: andw 14(%r10), %r12w
+; CHECK-BASELINE-NEXT: xorl %r13d, %r12d
+; CHECK-BASELINE-NEXT: movzwl 16(%r9), %r13d
+; CHECK-BASELINE-NEXT: xorw %r15w, %r13w
+; CHECK-BASELINE-NEXT: andw 16(%r10), %r13w
+; CHECK-BASELINE-NEXT: xorl %r15d, %r13d
+; CHECK-BASELINE-NEXT: movzwl 18(%r9), %r15d
+; CHECK-BASELINE-NEXT: xorw %bp, %r15w
+; CHECK-BASELINE-NEXT: andw 18(%r10), %r15w
+; CHECK-BASELINE-NEXT: xorl %ebp, %r15d
+; CHECK-BASELINE-NEXT: movl 20(%r8), %eax
+; CHECK-BASELINE-NEXT: movzwl 20(%r9), %ebp
+; CHECK-BASELINE-NEXT: xorw %ax, %bp
+; CHECK-BASELINE-NEXT: andw 20(%r10), %bp
; CHECK-BASELINE-NEXT: xorl %eax, %ebp
-; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r14w, %ax
-; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r14d
-; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r15w, %ax
-; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r15d
-; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d
-; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r13w, %ax
-; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r13d
-; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r9w, %ax
-; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r9d
-; CHECK-BASELINE-NEXT: movzwl 24(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax
-; CHECK-BASELINE-NEXT: xorw %r8w, %ax
-; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax
-; CHECK-BASELINE-NEXT: xorl %eax, %r8d
-; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax
-; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d
-; CHECK-BASELINE-NEXT: xorw %ax, %r10w
-; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w
-; CHECK-BASELINE-NEXT: xorl %r10d, %eax
-; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d
-; CHECK-BASELINE-NEXT: xorw %r10w, %r11w
-; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w
-; CHECK-BASELINE-NEXT: xorl %r11d, %r10d
-; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx
-; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi
-; CHECK-BASELINE-NEXT: xorw %dx, %si
-; CHECK-BASELINE-NEXT: andw 30(%rcx), %si
-; CHECK-BASELINE-NEXT: xorl %esi, %edx
-; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi)
-; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi)
-; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi)
-; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi)
-; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi)
-; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi)
-; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi)
-; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi)
-; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi)
-; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi)
+; CHECK-BASELINE-NEXT: movzwl 22(%r8), %eax
+; CHECK-BASELINE-NEXT: movzwl 22(%r9), %esi
+; CHECK-BASELINE-NEXT: xorw %ax, %si
+; CHECK-BASELINE-NEXT: andw 22(%r10), %si
+; CHECK-BASELINE-NEXT: xorl %eax, %esi
+; CHECK-BASELINE-NEXT: movl 24(%r8), %eax
+; CHECK-BASELINE-NEXT: movzwl 24(%r9), %edx
+; CHECK-BASELINE-NEXT: xorw %ax, %dx
+; CHECK-BASELINE-NEXT: andw 24(%r10), %dx
+; CHECK-BASELINE-NEXT: xorl %eax, %edx
+; CHECK-BASELINE-NEXT: movzwl 26(%r8), %eax
+; CHECK-BASELINE-NEXT: movzwl 26(%r9), %ecx
+; CHECK-BASELINE-NEXT: xorw %ax, %cx
+; CHECK-BASELINE-NEXT: andw 26(%r10), %cx
+; CHECK-BASELINE-NEXT: xorl %eax, %ecx
+; CHECK-BASELINE-NEXT: movl 28(%r8), %edi
+; CHECK-BASELINE-NEXT: movzwl 28(%r9), %eax
+; CHECK-BASELINE-NEXT: xorw %di, %ax
+; CHECK-BASELINE-NEXT: andw 28(%r10), %ax
+; CHECK-BASELINE-NEXT: xorl %edi, %eax
+; CHECK-BASELINE-NEXT: movzwl 30(%r8), %edi
+; CHECK-BASELINE-NEXT: movzwl 30(%r9), %r8d
+; CHECK-BASELINE-NEXT: xorw %di, %r8w
+; CHECK-BASELINE-NEXT: andw 30(%r10), %r8w
+; CHECK-BASELINE-NEXT: xorl %edi, %r8d
+; CHECK-BASELINE-NEXT: movw %r8w, 30(%r11)
+; CHECK-BASELINE-NEXT: movw %ax, 28(%r11)
+; CHECK-BASELINE-NEXT: movw %cx, 26(%r11)
+; CHECK-BASELINE-NEXT: movw %dx, 24(%r11)
+; CHECK-BASELINE-NEXT: movw %si, 22(%r11)
+; CHECK-BASELINE-NEXT: movw %bp, 20(%r11)
+; CHECK-BASELINE-NEXT: movw %r15w, 18(%r11)
+; CHECK-BASELINE-NEXT: movw %r13w, 16(%r11)
+; CHECK-BASELINE-NEXT: movw %r12w, 14(%r11)
+; CHECK-BASELINE-NEXT: movw %r14w, 12(%r11)
+; CHECK-BASELINE-NEXT: movw %bx, 10(%r11)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi)
+; CHECK-BASELINE-NEXT: movw %ax, 8(%r11)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi)
+; CHECK-BASELINE-NEXT: movw %ax, 6(%r11)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi)
+; CHECK-BASELINE-NEXT: movw %ax, 4(%r11)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi)
-; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi)
+; CHECK-BASELINE-NEXT: movw %ax, 2(%r11)
; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT: movw %ax, (%rdi)
-; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: movw %ax, (%r11)
+; CHECK-BASELINE-NEXT: movq %r11, %rax
; CHECK-BASELINE-NEXT: popq %rbx
; CHECK-BASELINE-NEXT: popq %r12
; CHECK-BASELINE-NEXT: popq %r13
@@ -1882,113 +1876,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: pushq %r13
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
-; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d
-; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d
-; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp
-; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx
-; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d
-; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d
-; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d
-; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d
-; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d
-; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d
-; CHECK-SSE1-NEXT: movzwl (%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r8w, %ax
-; CHECK-SSE1-NEXT: andw (%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r8d
-; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r12w, %ax
-; CHECK-SSE1-NEXT: andw 2(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r12d
-; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r9w, %ax
-; CHECK-SSE1-NEXT: andw 4(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r9d
-; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r10w, %ax
-; CHECK-SSE1-NEXT: andw 6(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r10d
-; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r11w, %ax
-; CHECK-SSE1-NEXT: andw 8(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r11d
-; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r13w, %ax
-; CHECK-SSE1-NEXT: andw 10(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r13d
-; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax
+; CHECK-SSE1-NEXT: movq %rcx, %r10
+; CHECK-SSE1-NEXT: movq %rdx, %r8
+; CHECK-SSE1-NEXT: movq %rsi, %r9
+; CHECK-SSE1-NEXT: movq %rdi, %r11
+; CHECK-SSE1-NEXT: movzwl 18(%rdx), %ebp
+; CHECK-SSE1-NEXT: movl 16(%rdx), %r15d
+; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r13d
+; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d
+; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r14d
+; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx
+; CHECK-SSE1-NEXT: movzwl 6(%rdx), %eax
+; CHECK-SSE1-NEXT: movl (%rdx), %ecx
+; CHECK-SSE1-NEXT: movl 4(%rdx), %edx
+; CHECK-SSE1-NEXT: movzwl 2(%r8), %esi
+; CHECK-SSE1-NEXT: movzwl (%r9), %edi
+; CHECK-SSE1-NEXT: xorw %cx, %di
+; CHECK-SSE1-NEXT: andw (%r10), %di
+; CHECK-SSE1-NEXT: xorl %ecx, %edi
+; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 2(%r9), %ecx
+; CHECK-SSE1-NEXT: xorw %si, %cx
+; CHECK-SSE1-NEXT: andw 2(%r10), %cx
+; CHECK-SSE1-NEXT: xorl %esi, %ecx
+; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 4(%r9), %ecx
+; CHECK-SSE1-NEXT: xorw %dx, %cx
+; CHECK-SSE1-NEXT: andw 4(%r10), %cx
+; CHECK-SSE1-NEXT: xorl %edx, %ecx
+; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 6(%r9), %ecx
+; CHECK-SSE1-NEXT: xorw %ax, %cx
+; CHECK-SSE1-NEXT: andw 6(%r10), %cx
+; CHECK-SSE1-NEXT: xorl %eax, %ecx
+; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 8(%r9), %eax
; CHECK-SSE1-NEXT: xorw %bx, %ax
-; CHECK-SSE1-NEXT: andw 12(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %ebx
-; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %bp, %ax
-; CHECK-SSE1-NEXT: andw 14(%rcx), %ax
+; CHECK-SSE1-NEXT: andw 8(%r10), %ax
+; CHECK-SSE1-NEXT: xorl %ebx, %eax
+; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movzwl 10(%r9), %ebx
+; CHECK-SSE1-NEXT: xorw %r14w, %bx
+; CHECK-SSE1-NEXT: andw 10(%r10), %bx
+; CHECK-SSE1-NEXT: xorl %r14d, %ebx
+; CHECK-SSE1-NEXT: movzwl 12(%r9), %r14d
+; CHECK-SSE1-NEXT: xorw %r12w, %r14w
+; CHECK-SSE1-NEXT: andw 12(%r10), %r14w
+; CHECK-SSE1-NEXT: xorl %r12d, %r14d
+; CHECK-SSE1-NEXT: movzwl 14(%r9), %r12d
+; CHECK-SSE1-NEXT: xorw %r13w, %r12w
+; CHECK-SSE1-NEXT: andw 14(%r10), %r12w
+; CHECK-SSE1-NEXT: xorl %r13d, %r12d
+; CHECK-SSE1-NEXT: movzwl 16(%r9), %r13d
+; CHECK-SSE1-NEXT: xorw %r15w, %r13w
+; CHECK-SSE1-NEXT: andw 16(%r10), %r13w
+; CHECK-SSE1-NEXT: xorl %r15d, %r13d
+; CHECK-SSE1-NEXT: movzwl 18(%r9), %r15d
+; CHECK-SSE1-NEXT: xorw %bp, %r15w
+; CHECK-SSE1-NEXT: andw 18(%r10), %r15w
+; CHECK-SSE1-NEXT: xorl %ebp, %r15d
+; CHECK-SSE1-NEXT: movl 20(%r8), %eax
+; CHECK-SSE1-NEXT: movzwl 20(%r9), %ebp
+; CHECK-SSE1-NEXT: xorw %ax, %bp
+; CHECK-SSE1-NEXT: andw 20(%r10), %bp
; CHECK-SSE1-NEXT: xorl %eax, %ebp
-; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r14w, %ax
-; CHECK-SSE1-NEXT: andw 16(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r14d
-; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r15w, %ax
-; CHECK-SSE1-NEXT: andw 18(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r15d
-; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d
-; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r13w, %ax
-; CHECK-SSE1-NEXT: andw 20(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r13d
-; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d
-; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r9w, %ax
-; CHECK-SSE1-NEXT: andw 22(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r9d
-; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d
-; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax
-; CHECK-SSE1-NEXT: xorw %r8w, %ax
-; CHECK-SSE1-NEXT: andw 24(%rcx), %ax
-; CHECK-SSE1-NEXT: xorl %eax, %r8d
-; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax
-; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d
-; CHECK-SSE1-NEXT: xorw %ax, %r10w
-; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w
-; CHECK-SSE1-NEXT: xorl %r10d, %eax
-; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d
-; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d
-; CHECK-SSE1-NEXT: xorw %r10w, %r11w
-; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w
-; CHECK-SSE1-NEXT: xorl %r11d, %r10d
-; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx
-; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi
-; CHECK-SSE1-NEXT: xorw %dx, %si
-; CHECK-SSE1-NEXT: andw 30(%rcx), %si
-; CHECK-SSE1-NEXT: xorl %esi, %edx
-; CHECK-SSE1-NEXT: movw %dx, 30(%rdi)
-; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi)
-; CHECK-SSE1-NEXT: movw %ax, 26(%rdi)
-; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi)
-; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi)
-; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi)
-; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi)
-; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi)
-; CHECK-SSE1-NEXT: movw %bp, 14(%rdi)
-; CHECK-SSE1-NEXT: movw %bx, 12(%rdi)
+; CHECK-SSE1-NEXT: movzwl 22(%r8), %eax
+; CHECK-SSE1-NEXT: movzwl 22(%r9), %esi
+; CHECK-SSE1-NEXT: xorw %ax, %si
+; CHECK-SSE1-NEXT: andw 22(%r10), %si
+; CHECK-SSE1-NEXT: xorl %eax, %esi
+; CHECK-SSE1-NEXT: movl 24(%r8), %eax
+; CHECK-SSE1-NEXT: movzwl 24(%r9), %edx
+; CHECK-SSE1-NEXT: xorw %ax, %dx
+; CHECK-SSE1-NEXT: andw 24(%r10), %dx
+; CHECK-SSE1-NEXT: xorl %eax, %edx
+; CHECK-SSE1-NEXT: movzwl 26(%r8), %eax
+; CHECK-SSE1-NEXT: movzwl 26(%r9), %ecx
+; CHECK-SSE1-NEXT: xorw %ax, %cx
+; CHECK-SSE1-NEXT: andw 26(%r10), %cx
+; CHECK-SSE1-NEXT: xorl %eax, %ecx
+; CHECK-SSE1-NEXT: movl 28(%r8), %edi
+; CHECK-SSE1-NEXT: movzwl 28(%r9), %eax
+; CHECK-SSE1-NEXT: xorw %di, %ax
+; CHECK-SSE1-NEXT: andw 28(%r10), %ax
+; CHECK-SSE1-NEXT: xorl %edi, %eax
+; CHECK-SSE1-NEXT: movzwl 30(%r8), %edi
+; CHECK-SSE1-NEXT: movzwl 30(%r9), %r8d
+; CHECK-SSE1-NEXT: xorw %di, %r8w
+; CHECK-SSE1-NEXT: andw 30(%r10), %r8w
+; CHECK-SSE1-NEXT: xorl %edi, %r8d
+; CHECK-SSE1-NEXT: movw %r8w, 30(%r11)
+; CHECK-SSE1-NEXT: movw %ax, 28(%r11)
+; CHECK-SSE1-NEXT: movw %cx, 26(%r11)
+; CHECK-SSE1-NEXT: movw %dx, 24(%r11)
+; CHECK-SSE1-NEXT: movw %si, 22(%r11)
+; CHECK-SSE1-NEXT: movw %bp, 20(%r11)
+; CHECK-SSE1-NEXT: movw %r15w, 18(%r11)
+; CHECK-SSE1-NEXT: movw %r13w, 16(%r11)
+; CHECK-SSE1-NEXT: movw %r12w, 14(%r11)
+; CHECK-SSE1-NEXT: movw %r14w, 12(%r11)
+; CHECK-SSE1-NEXT: movw %bx, 10(%r11)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT: movw %ax, 10(%rdi)
+; CHECK-SSE1-NEXT: movw %ax, 8(%r11)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT: movw %ax, 8(%rdi)
+; CHECK-SSE1-NEXT: movw %ax, 6(%r11)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT: movw %ax, 6(%rdi)
+; CHECK-SSE1-NEXT: movw %ax, 4(%r11)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT: movw %ax, 4(%rdi)
-; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi)
+; CHECK-SSE1-NEXT: movw %ax, 2(%r11)
; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT: movw %ax, (%rdi)
-; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: movw %ax, (%r11)
+; CHECK-SSE1-NEXT: movq %r11, %rax
; CHECK-SSE1-NEXT: popq %rbx
; CHECK-SSE1-NEXT: popq %r12
; CHECK-SSE1-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index cfa71ff..816d5ca 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -174,20 +174,19 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $2, %xmm1
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm4
+; SSE-NEXT: psrlw $8, %xmm4
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: paddb %xmm4, %xmm0
+; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: psubb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
@@ -197,19 +196,18 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
@@ -220,14 +218,14 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
@@ -238,14 +236,14 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
@@ -264,26 +262,25 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,64,128,32,64,128,128,64]
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: paddb %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: psraw $8, %xmm2
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: psrlw $7, %xmm0
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,64,128,64,32,128,64,32]
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_divconstant_16i8:
@@ -292,24 +289,23 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,64,128,32,64,128,128,64]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,64,128,32,64,128,128,64]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
@@ -321,14 +317,14 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
@@ -341,12 +337,12 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
@@ -568,25 +564,24 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $2, %xmm2
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: psrlw $7, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm4
+; SSE-NEXT: psrlw $8, %xmm4
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: paddb %xmm0, %xmm4
+; SSE-NEXT: psrlw $2, %xmm4
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE-NEXT: pxor %xmm2, %xmm4
+; SSE-NEXT: psubb %xmm2, %xmm4
+; SSE-NEXT: pcmpgtb %xmm4, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: psllw $3, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: paddb %xmm2, %xmm1
-; SSE-NEXT: psubb %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psllw $3, %xmm2
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: psubb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm4
+; SSE-NEXT: paddb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
@@ -596,19 +591,18 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -623,14 +617,14 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -645,14 +639,14 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -675,27 +669,26 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: packuswb %xmm1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
-; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: paddb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [32,64,128,32,64,128,128,64]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: packuswb %xmm2, %xmm3
-; SSE2-NEXT: psrlw $7, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: paddb %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,64,128,64,32,128,64,32]
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: psubb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,13,12,11,10,9,9,7]
@@ -710,39 +703,38 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm1, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE41-NEXT: psraw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32]
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: psrlw $7, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
-; SSE41-NEXT: psllw $8, %xmm2
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: paddb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: psraw $8, %xmm3
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [32,64,128,32,64,128,128,64]
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT: psraw $8, %xmm2
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32]
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_remconstant_16i8:
@@ -751,24 +743,23 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,64,128,32,64,128,128,64]
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,64,128,32,64,128,128,64]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
@@ -786,14 +777,14 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -812,12 +803,12 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm3
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsrlw $7, %xmm2, %xmm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
@@ -1127,25 +1118,25 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [26368,47872,11008,20224,37632,35072,33024,30976]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [33024,22016,33024,26368,11008,37632,33024,14592]
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [33024,22016,33024,26368,11008,37632,33024,14592]
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,128,64,32,32,32,32]
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: paddb %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: psraw $8, %xmm2
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,256,128,128,256,64,64,128]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,128,64,32,32,32,32]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: psrlw $7, %xmm0
-; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,256,128,128,256,64,64,128]
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
@@ -1155,23 +1146,23 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [26368,47872,11008,20224,37632,35072,33024,30976]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [33024,22016,33024,26368,11008,37632,33024,14592]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [33024,22016,33024,26368,11008,37632,33024,14592]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,128,64,32,32,32,32]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,256,128,128,256,64,64,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,32,128,64,32,32,32,32]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,256,128,128,256,64,64,128]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -1184,13 +1175,14 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,128,256,64,64,128,64,32,128,64,32,32,32,32]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,256,128,128,256,64,64,128,64,32,128,64,32,32,32,32]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
@@ -1204,11 +1196,12 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 3b9ac63..63c69e5 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -169,31 +169,28 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm6
+; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT: vpmulhw %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -204,19 +201,18 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm4, %ymm3
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i8:
@@ -226,14 +222,14 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
%res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
@@ -246,52 +242,49 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_divconstant_32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,20224,11008,47872,26368,14592,33024,37632]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [47872,12544,26368,6912,14592,30976,33024,35072]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [32,64,128,32,64,128,64,64]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [16,64,32,128,64,32,32,32]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [16,64,32,128,64,32,32,32]
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
+; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [35072,33024,30976,14592,6912,26368,12544,47872]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [37632,33024,14592,26368,47872,11008,20224,37632]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,32,32,64,128,32,64,16]
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,32,32,64,128,32,64,16]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_divconstant_32i8:
@@ -300,24 +293,23 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [35072,33024,30976,14592,6912,26368,12544,47872,37632,20224,11008,47872,26368,14592,33024,37632]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2NOBW-NEXT: vpsraw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [32,32,32,64,128,32,64,16,32,64,128,32,64,128,64,64]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2NOBW-NEXT: vpsraw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,64,128,64,32,128,64,32,16,64,32,128,64,32,32,32]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,32,32,64,128,32,64,16,32,64,128,32,64,128,64,64]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,64,128,64,32,128,64,32,16,64,32,128,64,32,32,32]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_divconstant_32i8:
@@ -328,12 +320,12 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
%res = sdiv <32 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
ret <32 x i8> %res
@@ -544,52 +536,49 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmulhw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3
-; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsubb %xmm8, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm7
+; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2
-; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubb %xmm8, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_rem7_32i8:
@@ -599,19 +588,18 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm4, %ymm3
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -625,14 +613,14 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
-; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -665,18 +653,16 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [32,64,128,32,64,128,64,64]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [16,64,32,128,64,32,32,32]
-; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [16,64,32,128,64,32,32,32]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
@@ -684,25 +670,24 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [35072,33024,30976,14592,6912,26368,12544,47872]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,32,32,64,128,32,64,16]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [32,32,32,64,128,32,64,16]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -716,24 +701,23 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [35072,33024,30976,14592,6912,26368,12544,47872,37632,20224,11008,47872,26368,14592,33024,37632]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2NOBW-NEXT: vpsraw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,32,32,64,128,32,64,16,32,64,128,32,64,128,64,64]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2NOBW-NEXT: vpsraw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,64,128,64,32,128,64,32,16,64,32,128,64,32,32,32]
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; AVX2NOBW-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2NOBW-NEXT: vpsraw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,32,32,64,128,32,64,16,32,64,128,32,64,128,64,64]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2NOBW-NEXT: vpsraw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,64,128,64,32,128,64,32,16,64,32,128,64,32,32,32]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
@@ -750,12 +734,12 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 6256c4f..0ea7548 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -140,31 +140,28 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm3
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm6
+; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmulhw %ymm4, %ymm6, %ymm6
+; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT: vpmulhw %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %ymm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
@@ -180,13 +177,13 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & mem)
-; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
@@ -199,52 +196,49 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_divconstant_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [16,32,16,128,32,64,64,16,32,64,128,32,64,128,64,64]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [64,16,32,32,16,16,16,16,16,64,32,128,64,32,32,32]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,16,32,32,16,16,16,16,16,64,32,128,64,32,32,32]
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm4
+; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,32,32,64,128,32,64,16,16,16,16,16,32,32,16,64]
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,32,32,64,128,32,64,16,16,16,16,16,32,32,16,64]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16]
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_divconstant_64i8:
@@ -263,14 +257,14 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsraw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = sdiv <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
ret <64 x i8> %res
@@ -447,52 +441,49 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm3
-; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubb %ymm8, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
-; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm7
+; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %ymm2, %ymm8, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm8, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm9, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT: vpmulhw %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpxor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsllw $3, %ymm1, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_64i8:
@@ -507,13 +498,13 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 = zmm3 ^ (zmm2 & mem)
-; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsubb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & mem)
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm2
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
@@ -546,18 +537,16 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [16,32,16,128,32,64,64,16,32,64,128,32,64,128,64,64]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpsraw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [64,16,32,32,16,16,16,16,16,64,32,128,64,32,32,32]
-; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,32,16,16,16,16,16,64,32,128,64,32,32,32]
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm5
+; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
@@ -565,25 +554,24 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,32,32,64,128,32,64,16,16,16,16,16,32,32,16,64]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpaddb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [32,32,32,64,128,32,64,16,16,16,16,16,32,32,16,64]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16]
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
@@ -607,14 +595,14 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsraw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm2
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 782a81b..674bad2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -740,16 +740,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride3_vf16:
@@ -763,16 +762,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride3_vf16:
@@ -786,16 +784,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride3_vf16:
@@ -809,16 +806,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride3_vf16:
@@ -832,16 +828,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -855,16 +850,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride3_vf16:
@@ -878,16 +872,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -901,16 +894,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 47a6022..7cddebd 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -962,16 +962,15 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovdqu %xmm1, (%rdi)
; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
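
The codegen change in this hunk (and in the store_i8_stride3_vf16 hunks of the previous file) keeps the whole stride-3 store in XMM registers: the 16-byte shuffle mask [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] is now applied per 128-bit lane with three vpshufb, so the old vinserti128 / broadcast-32-byte-constant / ymm vpshufb sequence disappears, along with the trailing vzeroupper it forced. For reference, this is the IR shape being lowered; the sketch below is a hedged, reduced variant with hypothetical <4 x i8> inputs instead of the test's <16 x i8>, not the test itself:

define void @interleaved_store_vf4_i8_stride3(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, ptr %p) {
  ; concatenate a and b, pad c with undef to the same width
  %ab  = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cu  = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; interleave to a0,b0,c0, a1,b1,c1, ... and store as one wide vector
  %abc = shufflevector <8 x i8> %ab, <8 x i8> %cu, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %abc, ptr %p, align 1
  ret void
}
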
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
index d1655bf..c4016e7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
@@ -4,12 +4,6 @@
; Handled strictly:
; - i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
; - i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
-; - <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
-; - <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
-; - <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
-; - <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
-; - <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2)
-; - <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
; - void @llvm.x86.avx.vzeroall()
; - void @llvm.x86.avx.vzeroupper()
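
The six vpermilvar entries leave the "handled strictly" list because the instrumentation no longer aborts when the data operand carries shadow: as the updated checks in the following hunks show, it runs the same permutation over the data operand's shadow and stores the permuted shadow as the result shadow, while the index operand is still strictly checked (any uninitialized index bit reaches __msan_warning_noreturn). A minimal, self-contained sketch of the new rule, with invented value names; the real generated code follows in the hunks below:

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
declare void @__msan_warning_noreturn()

define <2 x i64> @vpermilvar_pd_shadow(<2 x i64> %data_shadow, <2 x i64> %idx_shadow, <2 x i64> %idx) {
entry:
  ; propagate: permute the data shadow with the same (concrete) indices
  %sd         = bitcast <2 x i64> %data_shadow to <2 x double>
  %sp         = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %sd, <2 x i64> %idx)
  %ret_shadow = bitcast <2 x double> %sp to <2 x i64>
  ; check: the index operand must be fully initialized
  %flat = bitcast <2 x i64> %idx_shadow to i128
  %bad  = icmp ne i128 %flat, 0
  br i1 %bad, label %warn, label %ok

warn:
  call void @__msan_warning_noreturn()
  unreachable

ok:
  ret <2 x i64> %ret_shadow
}
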
@@ -956,19 +950,19 @@ define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1)
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[RES]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0:%.*]], <2 x i64> [[A1:%.*]])
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
+; CHECK: 8:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A2:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES1]]
;
%res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
@@ -981,19 +975,19 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64>
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0:%.*]], <4 x i64> [[A1:%.*]])
-; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x double> [[RES]]
+; CHECK: 8:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A2:%.*]], <4 x i64> [[A1]])
+; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[RES1]]
;
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
@@ -1004,16 +998,12 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256_2(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn()
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0:%.*]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
-; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x double> [[RES]]
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A1:%.*]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
+; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[RES1]]
;
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
@@ -1024,19 +1014,19 @@ define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP1]] to <4 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0:%.*]], <4 x i32> [[A1:%.*]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
+; CHECK: 8:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A2:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES1]]
;
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
@@ -1057,19 +1047,19 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A2]])
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0:%.*]], <4 x i32> [[A2]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A3:%.*]], <4 x i32> [[A2]])
+; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES1]]
;
%a2 = load <4 x i32>, ptr %a1
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1]
@@ -1083,19 +1073,19 @@ define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x float> [[RES]]
+; CHECK: 8:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A2:%.*]], <8 x i32> [[A1]])
+; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[RES1]]
;
%res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
index 2ce2c07..5aeaa12 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -8143,19 +8143,19 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP7]]
+; CHECK: 8:
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]])
+; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP5]]
;
%res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
ret <8 x double> %res
@@ -8169,27 +8169,27 @@ define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0,
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK: 10:
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X4:%.*]], <8 x i64> [[X1]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP6]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP6]]
; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP7]], <8 x double> [[X2]]
; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x double> [[TMP18]]
;
@@ -8204,26 +8204,26 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK: 9:
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]])
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64>
; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP5]]
; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP6]], <8 x double> zeroinitializer
; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x double> [[TMP16]]
;
@@ -8238,19 +8238,19 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
+; CHECK: 8:
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]])
+; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP5]]
;
%res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
ret <16 x float> %res
@@ -8264,27 +8264,27 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0,
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK: 10:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X4:%.*]], <16 x i32> [[X1]])
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP6]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP6]]
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP7]], <16 x float> [[X2]]
; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[TMP18]]
;
@@ -8300,26 +8300,26 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK: 9:
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]])
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP5]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP6]], <16 x float> zeroinitializer
; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[TMP16]]
;
@@ -8335,52 +8335,40 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>)
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP33:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>)
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP6]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP33]] to <16 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP6]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[X2]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP17]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
-; CHECK: 18:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 19:
-; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>)
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP33]], <16 x float> [[X2]]
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP18:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[TMP17]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x float> [[TMP18]] to <16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X1]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>)
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16 [[X3]] to <16 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[TMP20]] to <16 x i32>
; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], [[TMP19]]
; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP27]], <16 x i32> [[TMP23]]
; CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[TMP20]], <16 x float> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP29]], 0
-; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[TMP30:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[TMP29]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x float> [[TMP30]] to <16 x i32>
+; CHECK-NEXT: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X1]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], [[_MSPROP_SELECT1]]
; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[TMP16]], [[TMP28]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> zeroinitializer, [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP31]], [[_MSPROP]]
; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP32]], [[RES3]]
; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[RES4]]
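
In the masked and maskz variants above, the permuted shadow also replaces the zeroinitializer placeholders in the select-shadow arithmetic. The per-lane rule encoded by that xor/or/select chain: a lane takes the shadow of whichever operand the mask picks, and when the mask bit itself is poisoned the lane is clean only if both candidates agree and both of their shadows are clean. A hedged, self-contained sketch of that rule with invented names (the checks above inline it rather than calling a helper):

define <8 x i64> @select_shadow(<8 x i1> %mask, <8 x i1> %mask_shadow,
                                <8 x i64> %a, <8 x i64> %sa,
                                <8 x i64> %b, <8 x i64> %sb) {
  ; mask bit known: just the chosen operand's shadow
  %picked = select <8 x i1> %mask, <8 x i64> %sa, <8 x i64> %sb
  ; mask bit poisoned: any bit where a and b differ, or where either
  ; operand's shadow is set, is poisoned
  %diff      = xor <8 x i64> %a, %b
  %t         = or <8 x i64> %diff, %sa
  %uncertain = or <8 x i64> %t, %sb
  %shadow = select <8 x i1> %mask_shadow, <8 x i64> %uncertain, <8 x i64> %picked
  ret <8 x i64> %shadow
}
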
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
index d6fa08c..1644a5e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
@@ -8657,19 +8657,19 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES]]
+; CHECK: 8:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]])
+; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES1]]
;
%res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
ret <8 x double> %res
@@ -8682,27 +8682,27 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0,
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK: 10:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X3:%.*]], <8 x i64> [[X1]])
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP6]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES1]] to <8 x i64>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP6]]
; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]]
-; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> [[X2]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES1]], <8 x double> [[X2]]
; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x double> [[RES2]]
;
@@ -8718,26 +8718,26 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES1]] to <8 x i64>
; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]]
-; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> zeroinitializer
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES1]], <8 x double> zeroinitializer
; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x double> [[RES2]]
;
@@ -8754,19 +8754,19 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
+; CHECK: 8:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]])
+; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES1]]
;
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
ret <16 x float> %res
@@ -8779,27 +8779,27 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0,
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK: 10:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X3:%.*]], <16 x i32> [[X1]])
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP6]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES1]] to <16 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP6]]
; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
-; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES1]], <16 x float> [[X2]]
; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[RES2]]
;
@@ -8815,26 +8815,26 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES1]] to <16 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]]
-; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES1]], <16 x float> zeroinitializer
; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[RES2]]
;
@@ -8848,16 +8848,12 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x fl
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X1:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES1]]
;
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
ret <16 x float> %res
@@ -8869,24 +8865,20 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16
; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X1:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP6]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES1]] to <16 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]]
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES1]], <16 x float> [[X2]]
; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[RES2]]
;
@@ -8901,23 +8893,19 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<1
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X1:%.*]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES1]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES1]], <16 x float> zeroinitializer
; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x float> [[RES2]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll
index 06c62dd..613146ff 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll
@@ -4,12 +4,6 @@
; Handled strictly:
; - i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
; - i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
-; - <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
-; - <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
-; - <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
-; - <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
-; - <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2)
-; - <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
; - void @llvm.x86.avx.vzeroall()
; - void @llvm.x86.avx.vzeroupper()
@@ -996,19 +990,19 @@ define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1)
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0]], <2 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[RES]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0:%.*]], <2 x i64> [[A1:%.*]])
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A2:%.*]], <2 x i64> [[A1]])
+; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES1]]
;
%res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
@@ -1022,19 +1016,19 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0:%.*]], <4 x i64> [[A1:%.*]])
-; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x double> [[RES]]
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A2:%.*]], <4 x i64> [[A1]])
+; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[RES1]]
;
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
@@ -1046,16 +1040,12 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn()
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0:%.*]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
-; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x double> [[RES]]
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A1:%.*]], <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
+; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x double> [[RES1]]
;
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
@@ -1067,19 +1057,19 @@ define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP1]] to <4 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0:%.*]], <4 x i32> [[A1:%.*]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A2:%.*]], <4 x i32> [[A1]])
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES1]]
;
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
@@ -1091,7 +1081,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
; CHECK: 4:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
@@ -1101,19 +1091,19 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0
; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A2]])
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0:%.*]], <4 x i32> [[A2]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A3:%.*]], <4 x i32> [[A2]])
+; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES1]]
;
%a2 = load <4 x i32>, ptr %a1
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1]
@@ -1128,19 +1118,19 @@ define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
+; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn()
; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x float> [[RES]]
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A2:%.*]], <8 x i32> [[A1]])
+; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x float> [[RES1]]
;
%res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
diff --git a/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s b/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s
index aab664f..0279c81 100644
--- a/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s
+++ b/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s
@@ -64,59 +64,115 @@ test:
.L6:
ret
-# CHECK-INST: qc.e.beqi a0, 0x1, 0x7a26
+# CHECK-INST: qc.e.beqi a0, 0x51, 0x7a26
# CHECK-INST-NEXT: jal zero, 0x8e76
-# CHECK-INST-RELAX: qc.e.beqi a0, 0x1, 0x7a26
+# CHECK-INST-RELAX: qc.e.beqi a0, 0x51, 0x7a26
# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
- qc.e.bnei a0, 1, .L7
+ qc.e.bnei a0, 81, .L7
.fill 1300, 4, 0
.L7:
ret
-# CHECK-INST: qc.e.bnei a0, 0x2, 0x8e82
+# CHECK-INST: qc.e.bnei a0, 0x3e, 0x8e82
# CHECK-INST-NEXT: jal zero, 0xa2d2
-# CHECK-INST-RELAX: qc.e.bnei a0, 0x2, 0x8e82
+# CHECK-INST-RELAX: qc.e.bnei a0, 0x3e, 0x8e82
# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
- qc.e.beqi a0, 2, .L8
+ qc.e.beqi a0, 62, .L8
.fill 1300, 4, 0
.L8:
ret
-# CHECK-INST: qc.e.bgei a0, 0x3, 0xa2de
+# CHECK-INST: qc.e.bgei a0, 0x5d, 0xa2de
# CHECK-INST-NEXT: jal zero, 0xb72e
-# CHECK-INST-RELAX: qc.e.bgei a0, 0x3, 0xa2de
+# CHECK-INST-RELAX: qc.e.bgei a0, 0x5d, 0xa2de
# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
- qc.e.blti a0, 3, .L9
+ qc.e.blti a0, 93, .L9
.fill 1300, 4, 0
.L9:
ret
-# CHECK-INST: qc.e.blti a0, 0x4, 0xb73a
+# CHECK-INST: qc.e.blti a0, 0x2c, 0xb73a
# CHECK-INST-NEXT: jal zero, 0xcb8a
-# CHECK-INST-RELAX: qc.e.blti a0, 0x4, 0xb73a
+# CHECK-INST-RELAX: qc.e.blti a0, 0x2c, 0xb73a
# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
- qc.e.bgei a0, 4, .L10
+ qc.e.bgei a0, 44, .L10
.fill 1300, 4, 0
.L10:
ret
-# CHECK-INST: qc.e.bgeui a0, 0x5, 0xcb96
+# CHECK-INST: qc.e.bgeui a0, 0x37, 0xcb96
# CHECK-INST-NEXT: jal zero, 0xdfe6
-# CHECK-INST-RELAX: qc.e.bgeui a0, 0x5, 0xcb96
+# CHECK-INST-RELAX: qc.e.bgeui a0, 0x37, 0xcb96
# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
- qc.e.bltui a0, 5, .L11
+ qc.e.bltui a0, 55, .L11
.fill 1300, 4, 0
.L11:
ret
-# CHECK-INST: qc.e.bltui a0, 0x6, 0xdff2
+# CHECK-INST: qc.e.bltui a0, 0x24, 0xdff2
# CHECK-INST-NEXT: jal zero, 0xf442
-# CHECK-INST-RELAX: qc.e.bltui a0, 0x6, 0xdff2
+# CHECK-INST-RELAX: qc.e.bltui a0, 0x24, 0xdff2
# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
- qc.e.bgeui a0, 6, .L12
+ qc.e.bgeui a0, 36, .L12
.fill 1300, 4, 0
.L12:
ret
+# Check that instructions are first compressed and then relaxed
+
+# CHECK-INST: qc.beqi a0, 0xa, 0xf44c
+# CHECK-INST-NEXT: jal zero, 0x1089c
+# CHECK-INST-RELAX: qc.beqi a0, 0xa, 0xf44c
+# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
+ qc.e.bnei a0, 10, .L13
+.fill 1300, 4, 0
+.L13:
+ ret
+
+# CHECK-INST: qc.bnei a0, 0xa, 0x108a6
+# CHECK-INST-NEXT: jal zero, 0x11cf6
+# CHECK-INST-RELAX: qc.bnei a0, 0xa, 0x108a6
+# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
+ qc.e.beqi a0, 10, .L14
+.fill 1300, 4, 0
+.L14:
+ ret
+
+# CHECK-INST: qc.bgei a0, 0xa, 0x11d00
+# CHECK-INST-NEXT: jal zero, 0x13150
+# CHECK-INST-RELAX: qc.bgei a0, 0xa, 0x11d00
+# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
+ qc.e.blti a0, 10, .L15
+.fill 1300, 4, 0
+.L15:
+ ret
+
+# CHECK-INST: qc.blti a0, 0xa, 0x1315a
+# CHECK-INST-NEXT: jal zero, 0x145aa
+# CHECK-INST-RELAX: qc.blti a0, 0xa, 0x1315a
+# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
+ qc.e.bgei a0, 10, .L16
+.fill 1300, 4, 0
+.L16:
+ ret
+
+# CHECK-INST: qc.bgeui a0, 0xa, 0x145b4
+# CHECK-INST-NEXT: jal zero, 0x15a04
+# CHECK-INST-RELAX: qc.bgeui a0, 0xa, 0x145b4
+# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
+ qc.e.bltui a0, 10, .L17
+.fill 1300, 4, 0
+.L17:
+ ret
+
+# CHECK-INST: qc.bltui a0, 0xa, 0x15a0e
+# CHECK-INST-NEXT: jal zero, 0x16e5e
+# CHECK-INST-RELAX: qc.bltui a0, 0xa, 0x15a0e
+# CHECK-INST-RELAX-NEXT: jal zero, {{.*}}
+ qc.e.bgeui a0, 10, .L18
+.fill 1300, 4, 0
+.L18:
+ ret
+
.Lfunc_end0:
.size test, .Lfunc_end0-test
diff --git a/llvm/test/MC/RISCV/xqcibi-valid.s b/llvm/test/MC/RISCV/xqcibi-valid.s
index 63d35f3..88f7813 100644
--- a/llvm/test/MC/RISCV/xqcibi-valid.s
+++ b/llvm/test/MC/RISCV/xqcibi-valid.s
@@ -1,11 +1,11 @@
# Xqcibi - Qualcomm uC Branch Immediate Extension
# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcibi -M no-aliases -show-encoding \
-# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST,CHECK-NOALIAS %s
# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcibi < %s \
# RUN: | llvm-objdump --mattr=+experimental-xqcibi -M no-aliases -d - \
-# RUN: | FileCheck -check-prefix=CHECK-OBJ %s
+# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s
# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcibi -show-encoding \
-# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST,CHECK-ALIAS %s
# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcibi < %s \
# RUN: | llvm-objdump --mattr=+experimental-xqcibi -d - \
# RUN: | FileCheck -check-prefix=CHECK-OBJ %s
@@ -40,10 +40,10 @@ qc.bgeui x12, 11, 128
# CHECK-ENC: encoding: [0x7b,0x6d,0x71,0x28]
qc.bltui x2, 7, 666
-# CHECK-INST: qc.e.beqi ra, 1, 2
-# CHECK-OBJ: qc.e.beqi ra, 0x1, 0x1a
-# CHECK-ENC: encoding: [0x1f,0xc1,0x80,0x01,0x01,0x00]
-qc.e.beqi x1, 1, 2
+# CHECK-INST: qc.e.beqi ra, 111, 2
+# CHECK-OBJ: qc.e.beqi ra, 0x6f, 0x1a
+# CHECK-ENC: encoding: [0x1f,0xc1,0x80,0x01,0x6f,0x00]
+qc.e.beqi x1, 111, 2
# CHECK-INST: qc.e.bnei tp, 115, 4094
# CHECK-OBJ: qc.e.bnei tp, 0x73, 0x101c
@@ -65,7 +65,45 @@ qc.e.blti x1, 32767, 2000
# CHECK-ENC: encoding: [0x1f,0x40,0xf6,0x09,0xc7,0x02]
qc.e.bgeui x12, 711, 128
-# CHECK-INST: qc.e.bltui sp, 7, 666
-# CHECK-OBJ: qc.e.bltui sp, 0x7, 0x2d0
-# CHECK-ENC: encoding: [0x1f,0x4d,0xe1,0x29,0x07,0x00]
-qc.e.bltui x2, 7, 666
+# CHECK-INST: qc.e.bltui sp, 77, 666
+# CHECK-OBJ: qc.e.bltui sp, 0x4d, 0x2d0
+# CHECK-ENC: encoding: [0x1f,0x4d,0xe1,0x29,0x4d,0x00]
+qc.e.bltui x2, 77, 666
+
+# Check that compress patterns work as expected
+
+# CHECK-NOALIAS: beqi ra, 11, 2
+# CHECK-ALIAS: qc.beqi ra, 11, 2
+# CHECK-OBJ: qc.beqi ra, 0xb, 0x3e
+# CHECK-ENC: encoding: [0x7b,0x81,0xb0,0x00]
+qc.e.beqi x1, 11, 2
+
+# CHECK-NOALIAS: bnei ra, 11, 2
+# CHECK-ALIAS: qc.bnei ra, 11, 2
+# CHECK-OBJ: qc.bnei ra, 0xb, 0x42
+# CHECK-ENC: encoding: [0x7b,0x91,0xb0,0x00]
+qc.e.bnei x1, 11, 2
+
+# CHECK-NOALIAS: bgei ra, 11, 2
+# CHECK-ALIAS: qc.bgei ra, 11, 2
+# CHECK-OBJ: qc.bgei ra, 0xb, 0x46
+# CHECK-ENC: encoding: [0x7b,0xd1,0xb0,0x00]
+qc.e.bgei x1, 11, 2
+
+# CHECK-NOALIAS: blti ra, 11, 2
+# CHECK-ALIAS: qc.blti ra, 11, 2
+# CHECK-OBJ: qc.blti ra, 0xb, 0x4a
+# CHECK-ENC: encoding: [0x7b,0xc1,0xb0,0x00]
+qc.e.blti x1, 11, 2
+
+# CHECK-NOALIAS: bgeui ra, 11, 2
+# CHECK-ALIAS: qc.bgeui ra, 11, 2
+# CHECK-OBJ: qc.bgeui ra, 0xb, 0x4e
+# CHECK-ENC: encoding: [0x7b,0xf1,0xb0,0x00]
+qc.e.bgeui x1, 11, 2
+
+# CHECK-NOALIAS: bltui ra, 11, 2
+# CHECK-ALIAS: qc.bltui ra, 11, 2
+# CHECK-OBJ: qc.bltui ra, 0xb, 0x52
+# CHECK-ENC: encoding: [0x7b,0xe1,0xb0,0x00]
+qc.e.bltui x1, 11, 2
diff --git a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
index 570adbb..23e22b8 100644
--- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
@@ -82,6 +82,7 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const override;
// CHECK-NEXT: const uint8_t *getMatchTable() const override;
// CHECK-NEXT: bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI, const MatcherState &State) const override;
+// CHECK-NEXT: bool testMOPredicate_MO(unsigned PredicateID, const MachineOperand &MO, const MatcherState &State) const override;
// CHECK-NEXT: bool testSimplePredicate(unsigned PredicateID) const override;
// CHECK-NEXT: bool runCustomAction(unsigned FnID, const MatcherState &State, NewMIVector &OutMIs) const override;
// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL
@@ -159,6 +160,26 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-LABEL: // PatFrag predicates.
// CHECK-NEXT: enum {
+// CHECK-NEXT: GICXXPred_MO_Predicate_leaf = GICXXPred_Invalid + 1,
+// CHECK-NEXT: };
+
+// CHECK-LABEL: bool MyTargetInstructionSelector::testMOPredicate_MO(unsigned PredicateID, const MachineOperand & MO, const MatcherState &State) const {
+// CHECK-NEXT: const auto &Operands = State.RecordedOperands;
+// CHECK-NEXT: Register Reg = MO.getReg();
+// CHECK-NEXT: (void)Operands;
+// CHECK-NEXT: (void)Reg;
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GICXXPred_MO_Predicate_leaf: {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+
+
+// CHECK-LABEL: // PatFrag predicates.
+// CHECK-NEXT: enum {
// CHECK-NEXT: GICXXPred_I64_Predicate_cimm8 = GICXXPred_Invalid + 1,
// CHECK-NEXT: GICXXPred_I64_Predicate_simm8,
// CHECK-NEXT: };
@@ -508,12 +529,12 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3),
// R00C-NEXT: GIR_EraseRootFromParent_Done,
// R00C-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
//
-// R00O-NEXT: GIM_Reject,
+// R00O: GIM_Reject,
// R00O-NEXT: // Label [[GROUP_NUM]]: @[[GROUP]]
// R00O-NEXT: GIM_Reject,
// R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]]
// R00O-NEXT: GIM_Reject,
-// R00O-NEXT: }; // Size: 1856 bytes
+// R00O-NEXT: }; // Size: 1878 bytes
def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
[(set GPR32:$dst,
@@ -828,7 +849,7 @@ def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>;
// NOOPT-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
// NOOPT-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src3
// NOOPT-NEXT: GIR_RootConstrainSelectedInstOperands,
-// NOOPT-NEXT: // GIR_Coverage, 28,
+// NOOPT-NEXT: // GIR_Coverage, 29,
// NOOPT-NEXT: GIR_EraseRootFromParent_Done,
// NOOPT-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
@@ -837,6 +858,35 @@ def MULADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3),
(mul (add GPR32:$src1, GPR32:$src2), GPR32:$src3))]>,
Requires<[HasA]>;
+//===- Test a simple pattern with a PatLeaf and a predicate. ---------===//
+//
+// NOOPT-NEXT: /* 882 */ GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(924), // Rule ID 24 //
+// NOOPT-NEXT: /* 887 */ GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// NOOPT-NEXT: /* 890 */ GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_SUB),
+// NOOPT-NEXT: /* 894 */ // MIs[0] DstI[dst]
+// NOOPT-NEXT: /* 894 */ GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// NOOPT-NEXT: /* 897 */ GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// NOOPT-NEXT: /* 901 */ // MIs[0] src1
+// NOOPT-NEXT: /* 901 */ GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
+// NOOPT-NEXT: /* 904 */ GIM_CheckLeafOperandPredicate, /*MI*/0, /*MO*/1, /*Predicate*/GIMT_Encode2(GICXXPred_MO_Predicate_leaf),
+// NOOPT-NEXT: /* 909 */ // MIs[0] src2
+// NOOPT-NEXT: /* 909 */ GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
+// NOOPT-NEXT: /* 912 */ GIM_CheckLeafOperandPredicate, /*MI*/0, /*MO*/2, /*Predicate*/GIMT_Encode2(GICXXPred_MO_Predicate_leaf),
+// NOOPT-NEXT: /* 917 */ // (sub:{ *:[i32] } GPR32:{ *:[i32] }<<P:Predicate_leaf>>:$src1, GPR32:{ *:[i32] }<<P:Predicate_leaf>>:$src2) => (INSN5:{ *:[i32] } GPR32:{ *:[i32] }:$src1, GPR32:{ *:[i32] }:$src2)
+// NOOPT-NEXT: /* 917 */ GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::INSN5),
+// NOOPT-NEXT: /* 922 */ GIR_RootConstrainSelectedInstOperands,
+// NOOPT-NEXT: /* 923 */ // GIR_Coverage, 24,
+// NOOPT-NEXT: /* 923 */ GIR_Done,
+// NOOPT-NEXT: /* 924 */ // Label 13: @924
+
+def leaf: PatLeaf<(i32 GPR32:$src), [{ return true; // C++ code }]> {
+ let GISelLeafPredicateCode = [{ return true; }];
+}
+def INSN5 : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
+def : Pat<(sub leaf:$src1, leaf:$src2), (INSN5 GPR32:$src1, GPR32:$src2)>;
+
+
+
//===- Test a simple pattern with just a specific leaf immediate. ---------===//
//
// NOOPT-NEXT: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]),
@@ -984,7 +1034,7 @@ def LOAD : I<(outs GPR32:$dst), (ins GPR32:$src1),
// NOOPT-NEXT: // (ld:{ *:[i32] } GPR32:{ *:[i32] }:$src)<<P:Predicate_unindexedload>><<P:Predicate_load>> => (LOAD:{ *:[i32] } GPR32:{ *:[i32] }:$src)
// NOOPT-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::LOAD),
// NOOPT-NEXT: GIR_RootConstrainSelectedInstOperands,
-// NOOPT-NEXT: // GIR_Coverage, 24,
+// NOOPT-NEXT: // GIR_Coverage, 25,
// NOOPT-NEXT: GIR_Done,
// NOOPT-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
@@ -1083,7 +1133,7 @@ def DOUBLE : I<(outs GPR32:$dst), (ins GPR32:$src), [(set GPR32:$dst, (add GPR32
// NOOPT-NEXT: // (add:{ *:[i32] } i32:{ *:[i32] }:$samename, i32:{ *:[i32] }:$othername) => (InsnWithSpeciallyNamedDef:{ *:[i32] } i32:{ *:[i32] }:$samename, i32:{ *:[i32] }:$othername)
// NOOPT-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::InsnWithSpeciallyNamedDef),
// NOOPT-NEXT: GIR_RootConstrainSelectedInstOperands,
-// NOOPT-NEXT: // GIR_Coverage, 25,
+// NOOPT-NEXT: // GIR_Coverage, 26,
// NOOPT-NEXT: GIR_Done,
// NOOPT-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
@@ -1106,7 +1156,7 @@ def : Pat<(add i32:$samename, i32:$othername),
// NOOPT-NEXT: // (add:{ *:[i32] } i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2) => (ADD:{ *:[i32] } i32:{ *:[i32] }:$src1, i32:{ *:[i32] }:$src2)
// NOOPT-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::ADD),
// NOOPT-NEXT: GIR_RootConstrainSelectedInstOperands,
-// NOOPT-NEXT: // GIR_Coverage, 26,
+// NOOPT-NEXT: // GIR_Coverage, 27,
// NOOPT-NEXT: GIR_Done,
// NOOPT-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
@@ -1157,7 +1207,7 @@ def MUL : I<(outs GPR32:$dst), (ins GPR32:$src2, GPR32:$src1),
// NOOPT-NEXT: // (bitconvert:{ *:[i32] } FPR32:{ *:[f32] }:$src1) => (COPY_TO_REGCLASS:{ *:[i32] } FPR32:{ *:[f32] }:$src1, GPR32:{ *:[i32] })
// NOOPT-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
// NOOPT-NEXT: GIR_ConstrainOperandRC, /*InsnID*/0, /*Op*/0, GIMT_Encode2(MyTarget::GPR32RegClassID),
-// NOOPT-NEXT: // GIR_Coverage, 27,
+// NOOPT-NEXT: // GIR_Coverage, 28,
// NOOPT-NEXT: GIR_Done,
// NOOPT-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
@@ -1206,5 +1256,5 @@ def BR : I<(outs), (ins unknown:$target),
[(br bb:$target)]>;
// NOOPT-NEXT: GIM_Reject,
-// NOOPT-NEXT: }; // Size: 1459 bytes
+// NOOPT-NEXT: }; // Size: 1501 bytes
// NOOPT-NEXT: return MatchTable0;
diff --git a/llvm/test/TableGen/GlobalISelEmitter/HwModes.td b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
index 5103685..f112577 100644
--- a/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
@@ -54,6 +54,7 @@ class I<dag OOps, dag IOps, list<dag> Pat>
// CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const override;
// CHECK-NEXT: const uint8_t *getMatchTable() const override;
// CHECK-NEXT: bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI, const MatcherState &State) const override;
+// CHECK-NEXT: bool testMOPredicate_MO(unsigned PredicateID, const MachineOperand &MO, const MatcherState &State) const override;
// CHECK-NEXT: bool testSimplePredicate(unsigned PredicateID) const override;
// CHECK-NEXT: bool runCustomAction(unsigned FnID, const MatcherState &State, NewMIVector &OutMIs) const override;
// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL
diff --git a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
index c6695f1..8c69d07 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
@@ -13,6 +13,17 @@ define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {
ret <16 x i8> %data.aes
}
+define <16 x i8> @combineXorAeseZeroLhsARM64(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: define <16 x i8> @combineXorAeseZeroLhsARM64(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT: [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> [[DATA]], <16 x i8> [[KEY]])
+; CHECK-NEXT: ret <16 x i8> [[DATA_AES]]
+;
+ %data.xor = xor <16 x i8> %data, %key
+ %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> zeroinitializer, <16 x i8> %data.xor)
+ ret <16 x i8> %data.aes
+}
+
define <16 x i8> @combineXorAeseNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
; CHECK-LABEL: define <16 x i8> @combineXorAeseNonZeroARM64(
; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
@@ -36,6 +47,17 @@ define <16 x i8> @combineXorAesdZeroARM64(<16 x i8> %data, <16 x i8> %key) {
ret <16 x i8> %data.aes
}
+define <16 x i8> @combineXorAesdZeroLhsARM64(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: define <16 x i8> @combineXorAesdZeroLhsARM64(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT: [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> [[DATA]], <16 x i8> [[KEY]])
+; CHECK-NEXT: ret <16 x i8> [[DATA_AES]]
+;
+ %data.xor = xor <16 x i8> %data, %key
+ %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> zeroinitializer, <16 x i8> %data.xor)
+ ret <16 x i8> %data.aes
+}
+
define <16 x i8> @combineXorAesdNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
; CHECK-LABEL: define <16 x i8> @combineXorAesdNonZeroARM64(
; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
@@ -51,3 +73,51 @@ define <16 x i8> @combineXorAesdNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8>, <16 x i8>) #0
declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8>, <16 x i8>) #0
+; SVE
+
+define <vscale x 16 x i8> @combineXorAeseZeroLhsSVE(<vscale x 16 x i8> %data, <vscale x 16 x i8> %key) {
+; CHECK-LABEL: define <vscale x 16 x i8> @combineXorAeseZeroLhsSVE(
+; CHECK-SAME: <vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT: [[DATA_AES:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aese(<vscale x 16 x i8> [[DATA]], <vscale x 16 x i8> [[KEY]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DATA_AES]]
+;
+ %data.xor = xor <vscale x 16 x i8> %data, %key
+ %data.aes = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aese(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> %data.xor)
+ ret <vscale x 16 x i8> %data.aes
+}
+
+define <vscale x 16 x i8> @combineXorAeseZeroRhsSVE(<vscale x 16 x i8> %data, <vscale x 16 x i8> %key) {
+; CHECK-LABEL: define <vscale x 16 x i8> @combineXorAeseZeroRhsSVE(
+; CHECK-SAME: <vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT: [[DATA_AES:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aese(<vscale x 16 x i8> [[DATA]], <vscale x 16 x i8> [[KEY]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DATA_AES]]
+;
+ %data.xor = xor <vscale x 16 x i8> %data, %key
+ %data.aes = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aese(<vscale x 16 x i8> %data.xor, <vscale x 16 x i8> zeroinitializer)
+ ret <vscale x 16 x i8> %data.aes
+}
+
+define <vscale x 16 x i8> @combineXorAesdZeroLhsSVE(<vscale x 16 x i8> %data, <vscale x 16 x i8> %key) {
+; CHECK-LABEL: define <vscale x 16 x i8> @combineXorAesdZeroLhsSVE(
+; CHECK-SAME: <vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT: [[DATA_AES:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aesd(<vscale x 16 x i8> [[DATA]], <vscale x 16 x i8> [[KEY]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DATA_AES]]
+;
+ %data.xor = xor <vscale x 16 x i8> %data, %key
+ %data.aes = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aesd(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> %data.xor)
+ ret <vscale x 16 x i8> %data.aes
+}
+
+define <vscale x 16 x i8> @combineXorAesdZeroRhsSVE(<vscale x 16 x i8> %data, <vscale x 16 x i8> %key) {
+; CHECK-LABEL: define <vscale x 16 x i8> @combineXorAesdZeroRhsSVE(
+; CHECK-SAME: <vscale x 16 x i8> [[DATA:%.*]], <vscale x 16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT: [[DATA_AES:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aesd(<vscale x 16 x i8> [[DATA]], <vscale x 16 x i8> [[KEY]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DATA_AES]]
+;
+ %data.xor = xor <vscale x 16 x i8> %data, %key
+ %data.aes = tail call <vscale x 16 x i8> @llvm.aarch64.sve.aesd(<vscale x 16 x i8> %data.xor, <vscale x 16 x i8> zeroinitializer)
+ ret <vscale x 16 x i8> %data.aes
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.aese(<vscale x 16 x i8>, <vscale x 16 x i8>) #0
+declare <vscale x 16 x i8> @llvm.aarch64.sve.aesd(<vscale x 16 x i8>, <vscale x 16 x i8>) #0
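The new ZeroLhs and SVE tests above exercise a commuted form of the existing AES xor-fold: the AArch64 AESE/AESD instructions XOR their two operands before the (inverse) ShiftRows/SubBytes steps, so the intrinsic depends only on data ^ key, and aese(0, data ^ key) simplifies to aese(data, key) just as aese(data ^ key, 0) does. A before/after sketch of the rewrite the CHECK lines expect; the function names are illustrative only:

; Before the combine: the zero operand may appear on either side.
define <16 x i8> @before(<16 x i8> %data, <16 x i8> %key) {
  %x = xor <16 x i8> %data, %key
  %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> zeroinitializer, <16 x i8> %x)
  ret <16 x i8> %r
}

; After: the xor is folded into the intrinsic's two operands.
define <16 x i8> @after(<16 x i8> %data, <16 x i8> %key) {
  %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
  ret <16 x i8> %r
}

declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8>, <16 x i8>)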
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 9375d56..077da9c 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2743,7 +2743,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
@gv = constant i32 0
-define amdgpu_kernel void @readfirstlane_constant(i32 %arg, ptr %ptr) {
+define amdgpu_cs void @readfirstlane_constant(i32 %arg, ptr %ptr) {
; CHECK-LABEL: @readfirstlane_constant(
; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
; CHECK-NEXT: store volatile i32 [[VAR]], ptr [[PTR:%.*]], align 4
@@ -2829,7 +2829,7 @@ bb1:
declare i32 @llvm.amdgcn.readlane(i32, i32)
-define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane, ptr %ptr) {
+define amdgpu_cs void @readlane_constant(i32 %arg, i32 %lane, ptr %ptr) {
; CHECK-LABEL: @readlane_constant(
; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 7)
; CHECK-NEXT: store volatile i32 [[VAR]], ptr [[PTR:%.*]], align 4
@@ -3041,14 +3041,12 @@ define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1)
; llvm.amdgcn.permlane64
; --------------------------------------------------------------------
-define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src0) {
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; CHECK-LABEL: @permlane64_uniform(
-; CHECK-NEXT: [[SRC1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0:%.*]])
-; CHECK-NEXT: store i32 [[SRC1]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
- %src1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
- %res = call i32 @llvm.amdgcn.permlane64(i32 %src1)
+ %res = call i32 @llvm.amdgcn.permlane64(i32 %src)
store i32 %res, ptr addrspace(1) %out
ret void
}
@@ -6486,7 +6484,7 @@ define i32 @prng_poison_i32() {
; llvm.amdgcn.ds.bpermute
; --------------------------------------------------------------------
-define amdgpu_kernel void @ds_bpermute_uniform_src(ptr addrspace(1) %out, i32 %lane) {
+define void @ds_bpermute_uniform_src(ptr addrspace(1) %out, i32 %lane) {
; CHECK-LABEL: @ds_bpermute_uniform_src(
; CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
@@ -6496,7 +6494,7 @@ define amdgpu_kernel void @ds_bpermute_uniform_src(ptr addrspace(1) %out, i32 %l
ret void
}
-define amdgpu_kernel void @ds_bpermute_constant_lane(ptr addrspace(1) %out, i32 %src) {
+define void @ds_bpermute_constant_lane(ptr addrspace(1) %out, i32 %src) {
; CHECK-LABEL: @ds_bpermute_constant_lane(
; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 7)
; CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
@@ -6507,7 +6505,7 @@ define amdgpu_kernel void @ds_bpermute_constant_lane(ptr addrspace(1) %out, i32
ret void
}
-define amdgpu_kernel void @ds_bpermute_uniform_lane(ptr addrspace(1) %out, i32 %lanearg, i32 %src) {
+define void @ds_bpermute_uniform_lane(ptr addrspace(1) %out, i32 %lanearg, i32 %src) {
; CHECK-LABEL: @ds_bpermute_uniform_lane(
; CHECK-NEXT: [[LANE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[LANEARG:%.*]])
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[LANE]], 2
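The calling-convention switches in the hunks above (amdgpu_kernel to amdgpu_cs, or to a plain function) follow from the uniformity rules exercised by the new trivially-uniform.ll test below: once every kernel argument counts as trivially uniform, readfirstlane/readlane/ds_bpermute tests built on kernel arguments would fold away entirely and stop covering the path they were written for, so they move to conventions whose plain arguments remain divergent. A small sketch of the difference, assuming that fold:

; In a kernel, %arg is trivially uniform, so instcombine can drop the call:
define amdgpu_kernel void @kernel(i32 %arg, ptr %p) {
  %v = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %arg)  ; folds to %arg
  store volatile i32 %v, ptr %p
  ret void
}

; In a compute shader, a non-inreg %arg may be divergent, so the call stays:
define amdgpu_cs void @shader(i32 %arg, ptr %p) {
  %v = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %arg)  ; kept
  store volatile i32 %v, ptr %p
  ret void
}

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)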
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/trivially-uniform.ll b/llvm/test/Transforms/InstCombine/AMDGPU/trivially-uniform.ll
new file mode 100644
index 0000000..2a3e392
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/trivially-uniform.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn -mcpu=gfx1010 -passes=instcombine -S < %s | FileCheck %s
+
+; Use readfirstlane to demonstrate when InstCombine deems an input to
+; be trivially uniform.
+
+; Constants are trivially uniform.
+define i32 @test_constant() {
+; CHECK-LABEL: define i32 @test_constant(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret i32 7
+;
+ %r = call i32 @llvm.amdgcn.readfirstlane(i32 7)
+ ret i32 %r
+}
+
+; The result of an AlwaysUniform intrinsic is trivially uniform.
+define i32 @test_intrinsic(i32 %x) {
+; CHECK-LABEL: define i32 @test_intrinsic(
+; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]])
+; CHECK-NEXT: ret i32 [[Y]]
+;
+ %y = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
+ %r = call i32 @llvm.amdgcn.readfirstlane(i32 %y)
+ ret i32 %r
+}
+
+; In compute kernels, all arguments are trivially uniform.
+
+define amdgpu_kernel void @test_compute_i32(ptr %out, i32 %x) {
+; CHECK-LABEL: define amdgpu_kernel void @test_compute_i32(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store i32 [[X]], ptr [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %r = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
+ store i32 %r, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @test_compute_i1(ptr %out, i1 %x) {
+; CHECK-LABEL: define amdgpu_kernel void @test_compute_i1(
+; CHECK-SAME: ptr [[OUT:%.*]], i1 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store i1 [[X]], ptr [[OUT]], align 1
+; CHECK-NEXT: ret void
+;
+ %r = call i1 @llvm.amdgcn.readfirstlane(i1 %x)
+ store i1 %r, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @test_compute_v32i1(ptr %out, <32 x i1> %x) {
+; CHECK-LABEL: define amdgpu_kernel void @test_compute_v32i1(
+; CHECK-SAME: ptr [[OUT:%.*]], <32 x i1> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <32 x i1> [[X]], ptr [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %r = call <32 x i1> @llvm.amdgcn.readfirstlane(<32 x i1> %x)
+ store <32 x i1> %r, ptr %out
+ ret void
+}
+
+; In graphics shaders, inreg arguments are trivially uniform.
+
+define amdgpu_ps i32 @test_graphics_i32(i32 inreg %x) {
+; CHECK-LABEL: define amdgpu_ps i32 @test_graphics_i32(
+; CHECK-SAME: i32 inreg [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %r = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
+ ret i32 %r
+}
+
+define amdgpu_ps i1 @test_graphics_i1(i1 inreg %x) {
+; CHECK-LABEL: define amdgpu_ps i1 @test_graphics_i1(
+; CHECK-SAME: i1 inreg [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret i1 [[X]]
+;
+ %r = call i1 @llvm.amdgcn.readfirstlane(i1 %x)
+ ret i1 %r
+}
+
+define amdgpu_ps <32 x i1> @test_graphics_v32i1(<32 x i1> inreg %x) {
+; CHECK-LABEL: define amdgpu_ps <32 x i1> @test_graphics_v32i1(
+; CHECK-SAME: <32 x i1> inreg [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <32 x i1> [[X]]
+;
+ %r = call <32 x i1> @llvm.amdgcn.readfirstlane(<32 x i1> %x)
+ ret <32 x i1> %r
+}
+
+; In graphics shaders, non-inreg arguments are not trivially uniform.
+
+define amdgpu_ps i32 @test_graphics_i32_negative(i32 %x) {
+; CHECK-LABEL: define amdgpu_ps i32 @test_graphics_i32_negative(
+; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X]])
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %r = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
+ ret i32 %r
+}
+
+define amdgpu_ps i1 @test_graphics_i1_negative(i1 %x) {
+; CHECK-LABEL: define amdgpu_ps i1 @test_graphics_i1_negative(
+; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.amdgcn.readfirstlane.i1(i1 [[X]])
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %r = call i1 @llvm.amdgcn.readfirstlane(i1 %x)
+ ret i1 %r
+}
+
+define amdgpu_ps <32 x i1> @test_graphics_v32i1_negative(<32 x i1> %x) {
+; CHECK-LABEL: define amdgpu_ps <32 x i1> @test_graphics_v32i1_negative(
+; CHECK-SAME: <32 x i1> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = call <32 x i1> @llvm.amdgcn.readfirstlane.v32i1(<32 x i1> [[X]])
+; CHECK-NEXT: ret <32 x i1> [[R]]
+;
+ %r = call <32 x i1> @llvm.amdgcn.readfirstlane(<32 x i1> %x)
+ ret <32 x i1> %r
+}
+
+; Test i1 arguments in non-entry functions.
+
+define amdgpu_gfx i1 @test_callable_i1(i1 inreg %x) {
+; CHECK-LABEL: define amdgpu_gfx i1 @test_callable_i1(
+; CHECK-SAME: i1 inreg [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret i1 [[X]]
+;
+ %r = call i1 @llvm.amdgcn.readfirstlane(i1 %x)
+ ret i1 %r
+}
+
+define amdgpu_gfx <32 x i1> @test_callable_v32i1(<32 x i1> inreg %x) {
+; CHECK-LABEL: define amdgpu_gfx <32 x i1> @test_callable_v32i1(
+; CHECK-SAME: <32 x i1> inreg [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <32 x i1> [[X]]
+;
+ %r = call <32 x i1> @llvm.amdgcn.readfirstlane(<32 x i1> %x)
+ ret <32 x i1> %r
+}
+
+define amdgpu_gfx i1 @test_callable_i1_negative(i1 %x) {
+; CHECK-LABEL: define amdgpu_gfx i1 @test_callable_i1_negative(
+; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = call i1 @llvm.amdgcn.readfirstlane.i1(i1 [[X]])
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %r = call i1 @llvm.amdgcn.readfirstlane(i1 %x)
+ ret i1 %r
+}
+
+define amdgpu_gfx <32 x i1> @test_callable_v32i1_negative(<32 x i1> %x) {
+; CHECK-LABEL: define amdgpu_gfx <32 x i1> @test_callable_v32i1_negative(
+; CHECK-SAME: <32 x i1> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[R:%.*]] = call <32 x i1> @llvm.amdgcn.readfirstlane.v32i1(<32 x i1> [[X]])
+; CHECK-NEXT: ret <32 x i1> [[R]]
+;
+ %r = call <32 x i1> @llvm.amdgcn.readfirstlane(<32 x i1> %x)
+ ret <32 x i1> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/fcmp-select.ll b/llvm/test/Transforms/InstCombine/fcmp-select.ll
index 053b233..b622c86 100644
--- a/llvm/test/Transforms/InstCombine/fcmp-select.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp-select.ll
@@ -270,12 +270,36 @@ define i1 @test_fcmp_select_var_const_unordered(double %x, double %y) {
}
define i1 @test_fcmp_ord_select_fcmp_oeq_var_const(double %x) {
-; CHECK-LABEL: @test_fcmp_ord_select_fcmp_oeq_var_const(
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp oeq double [[X:%.*]], 1.000000e+00
-; CHECK-NEXT: ret i1 [[CMP1]]
+; CHECK-LABEL: @test_fcmp_ord_select_fcmp_oeq_var_const(
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq double [[X:%.*]], 1.000000e+00
+; CHECK-NEXT: ret i1 [[TMP1]]
;
%cmp1 = fcmp ord double %x, 0.000000e+00
%sel = select i1 %cmp1, double %x, double 0.000000e+00
%cmp2 = fcmp oeq double %sel, 1.000000e+00
ret i1 %cmp2
}
+
+; Make sure that we recognize the select pattern flavor (SPF) correctly.
+
+define float @test_select_nnan_nsz_fcmp_olt(float %x) {
+; CHECK-LABEL: @test_select_nnan_nsz_fcmp_olt(
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[TMP1]], float [[X]], float -0.000000e+00
+; CHECK-NEXT: ret float [[SEL1]]
+;
+ %cmp = fcmp olt float %x, 0.000000e+00
+ %sel = select nnan nsz i1 %cmp, float %x, float -0.000000e+00
+ ret float %sel
+}
+
+define float @test_select_nnan_nsz_fcmp_ult(float %x) {
+; CHECK-LABEL: @test_select_nnan_nsz_fcmp_ult(
+; CHECK-NEXT: [[DOTINV:%.*]] = fcmp oge float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[DOTINV]], float -0.000000e+00, float [[X]]
+; CHECK-NEXT: ret float [[SEL1]]
+;
+ %cmp = fcmp ult float %x, 0.000000e+00
+ %sel = select nnan nsz i1 %cmp, float %x, float -0.000000e+00
+ ret float %sel
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll
new file mode 100644
index 0000000..d824d6d
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
+
+define double @test_atan_0() {
+; CHECK-LABEL: define double @test_atan_0() {
+; CHECK-NEXT: ret double 0.000000e+00
+;
+ %result = call double @llvm.atan.f64(double 0.0)
+ ret double %result
+}
+
+define double @test_atan_one() {
+; CHECK-LABEL: define double @test_atan_one() {
+; CHECK-NEXT: ret double 0x3FE921FB54442D18
+;
+ %res = call double @llvm.atan.f64(double 1.0)
+ ret double %res
+}
+
+define <2 x double> @test_atan_v2() {
+; CHECK-LABEL: define <2 x double> @test_atan_v2() {
+; CHECK-NEXT: ret <2 x double> zeroinitializer
+;
+ %result = call <2 x double> @llvm.atan.v2f64(<2 x double> zeroinitializer)
+ ret <2 x double> %result
+}
+
+define double @test_atan_neg0() {
+; CHECK-LABEL: define double @test_atan_neg0() {
+; CHECK-NEXT: ret double -0.000000e+00
+;
+ %res = call double @llvm.atan.f64(double -0.0)
+ ret double %res
+}
+
+define double @test_atan_poison() {
+; CHECK-LABEL: define double @test_atan_poison() {
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double poison)
+; CHECK-NEXT: ret double [[RES]]
+;
+ %res = call double @llvm.atan.f64(double poison)
+ ret double %res
+}
+
+define double @test_atan_undef() {
+; CHECK-LABEL: define double @test_atan_undef() {
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double undef)
+; CHECK-NEXT: ret double [[RES]]
+;
+ %res = call double @llvm.atan.f64(double undef)
+ ret double %res
+}
+
+define double @test_atan_snan() {
+; CHECK-LABEL: define double @test_atan_snan() {
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double 0x7FF0000000000001)
+; CHECK-NEXT: ret double [[RES]]
+;
+ %res = call double @llvm.atan.f64(double 0x7ff0000000000001)
+ ret double %res
+}
+
+define double @test_atan_qnan() {
+; CHECK-LABEL: define double @test_atan_qnan() {
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double 0x7FF8000000000000)
+; CHECK-NEXT: ret double [[RES]]
+;
+ %res = call double @llvm.atan.f64(double 0x7ff8000000000000)
+ ret double %res
+}
+
+define double @test_atan_pos_inf() {
+; CHECK-LABEL: define double @test_atan_pos_inf() {
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double 0x7FF0000000000000)
+; CHECK-NEXT: ret double [[RES]]
+;
+ %res = call double @llvm.atan.f64(double 0x7ff0000000000000)
+ ret double %res
+}
+
+define double @test_atan_neg_inf() {
+; CHECK-LABEL: define double @test_atan_neg_inf() {
+; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double 0xFFF0000000000000)
+; CHECK-NEXT: ret double [[RES]]
+;
+ %res = call double @llvm.atan.f64(double 0xfff0000000000000)
+ ret double %res
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index d449408..5508a65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -60,16 +60,16 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]]
; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[N_VEC5]] to i8
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[TMP15]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
-; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[TMP15]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT11]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX6:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT11]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i32 0
@@ -87,12 +87,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP1]], align 8
; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 0e5e785a..c3fc91c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -161,8 +161,8 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index a9d5b5d..8095f25 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -17,12 +17,13 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
@@ -83,11 +84,12 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index f1947de..b498712 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -153,10 +153,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2
; CHECK-NEXT: [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]]
; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
-; CHECK-NEXT: [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
new file mode 100644
index 0000000..1b0feef
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 2
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=scalar-epilogue %s 2>&1 | FileCheck %s -check-prefix=SCALAR_EPILOGUE
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_TAIL_FOLDING
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_EVL
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
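+; Both tests load a conditionally-executed strided interleave group from %p
+; and store smax/negated-smax results back to %q. The three RUN lines cover
+; a scalar epilogue, predicated tail folding, and EVL tail folding; per the
+; checks below, the EVL configuration currently leaves the loop scalar.
+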
+define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) {
+; SCALAR_EPILOGUE-LABEL: define void @masked_strided_factor2
+; SCALAR_EPILOGUE-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
+; SCALAR_EPILOGUE-NEXT: entry:
+; SCALAR_EPILOGUE-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; SCALAR_EPILOGUE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_EPILOGUE-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; SCALAR_EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALAR_EPILOGUE: vector.ph:
+; SCALAR_EPILOGUE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_EPILOGUE-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; SCALAR_EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
+; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
+; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 3
+; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
+; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; SCALAR_EPILOGUE: vector.body:
+; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP9]]
+; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP10]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
+; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP12]]
+; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP13]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
+; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP15]]
+; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP14]], <vscale x 8 x ptr> [[TMP16]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP14]]
+; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP18]]
+; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP17]], <vscale x 8 x ptr> [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALAR_EPILOGUE: middle.block:
+; SCALAR_EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALAR_EPILOGUE: scalar.ph:
+;
+; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor2
+; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
+; PREDICATED_TAIL_FOLDING-NEXT: entry:
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDICATED_TAIL_FOLDING: vector.ph:
+; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
+; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDICATED_TAIL_FOLDING: vector.body:
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i1> [[TMP5]], <vscale x 8 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP8]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP9]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP12]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP14]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP13]], <vscale x 8 x ptr> [[TMP15]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP16]], <vscale x 8 x ptr> [[TMP18]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_TAIL_FOLDING: middle.block:
+; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
+; PREDICATED_TAIL_FOLDING: scalar.ph:
+;
+; PREDICATED_EVL-LABEL: define void @masked_strided_factor2
+; PREDICATED_EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
+; PREDICATED_EVL-NEXT: entry:
+; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_EVL-NEXT: br label [[FOR_BODY:%.*]]
+; PREDICATED_EVL: for.body:
+; PREDICATED_EVL-NEXT: [[IX_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; PREDICATED_EVL-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]]
+; PREDICATED_EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; PREDICATED_EVL: if.then:
+; PREDICATED_EVL-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1
+; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = zext nneg i32 [[MUL]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP0]]
+; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; PREDICATED_EVL-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1
+; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = zext nneg i32 [[ADD]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP2]]
+; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; PREDICATED_EVL-NEXT: [[SPEC_SELECT_I:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP1]], i8 [[TMP3]])
+; PREDICATED_EVL-NEXT: [[TMP4:%.*]] = zext nneg i32 [[MUL]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP4]]
+; PREDICATED_EVL-NEXT: store i8 [[SPEC_SELECT_I]], ptr [[ARRAYIDX6]], align 1
+; PREDICATED_EVL-NEXT: [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]]
+; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = zext nneg i32 [[ADD]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP5]]
+; PREDICATED_EVL-NEXT: store i8 [[SUB]], ptr [[ARRAYIDX11]], align 1
+; PREDICATED_EVL-NEXT: br label [[FOR_INC]]
+; PREDICATED_EVL: for.inc:
+; PREDICATED_EVL-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1
+; PREDICATED_EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024
+; PREDICATED_EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; PREDICATED_EVL: for.end:
+; PREDICATED_EVL-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, ptr %p, i32 %mul
+ %0 = load i8, ptr %arrayidx, align 1
+ %add = or disjoint i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %p, i32 %add
+ %1 = load i8, ptr %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, ptr %q, i32 %mul
+ store i8 %spec.select.i, ptr %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, ptr %q, i32 %add
+ store i8 %sub, ptr %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
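+; Same pattern with interleave factor 4: elements 4*i .. 4*i+3 of %p and %q.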
+define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) {
+; SCALAR_EPILOGUE-LABEL: define void @masked_strided_factor4
+; SCALAR_EPILOGUE-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
+; SCALAR_EPILOGUE-NEXT: entry:
+; SCALAR_EPILOGUE-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; SCALAR_EPILOGUE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_EPILOGUE-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; SCALAR_EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALAR_EPILOGUE: vector.ph:
+; SCALAR_EPILOGUE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_EPILOGUE-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; SCALAR_EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
+; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
+; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 3
+; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
+; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]]
+; SCALAR_EPILOGUE: vector.body:
+; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
+; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 2)
+; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 3)
+; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP12]]
+; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP13]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
+; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP14]]
+; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP15]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
+; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP16]]
+; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP17]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
+; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP18]]
+; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
+; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT: [[TMP21:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP20]]
+; SCALAR_EPILOGUE-NEXT: [[TMP22:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER3]])
+; SCALAR_EPILOGUE-NEXT: [[TMP23:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP22]]
+; SCALAR_EPILOGUE-NEXT: [[TMP24:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP24]]
+; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP20]], <vscale x 8 x ptr> [[TMP25]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT: [[TMP26:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP26]]
+; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP21]], <vscale x 8 x ptr> [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT: [[TMP28:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP28]]
+; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP22]], <vscale x 8 x ptr> [[TMP29]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT: [[TMP30:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
+; SCALAR_EPILOGUE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP30]]
+; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP23]], <vscale x 8 x ptr> [[TMP31]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALAR_EPILOGUE: middle.block:
+; SCALAR_EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALAR_EPILOGUE: scalar.ph:
+;
+; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor4
+; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
+; PREDICATED_TAIL_FOLDING-NEXT: entry:
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDICATED_TAIL_FOLDING: vector.ph:
+; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
+; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
+; PREDICATED_TAIL_FOLDING: vector.body:
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i1> [[TMP5]], <vscale x 8 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 2)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 3)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP12]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP14]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP16]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP18]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP19]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP21]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP23]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP19]], <vscale x 8 x ptr> [[TMP24]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP25]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP20]], <vscale x 8 x ptr> [[TMP26]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP27]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP21]], <vscale x 8 x ptr> [[TMP28]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP29]]
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP22]], <vscale x 8 x ptr> [[TMP30]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING: middle.block:
+; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
+; PREDICATED_TAIL_FOLDING: scalar.ph:
+;
+; PREDICATED_EVL-LABEL: define void @masked_strided_factor4
+; PREDICATED_EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
+; PREDICATED_EVL-NEXT: entry:
+; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
+; PREDICATED_EVL-NEXT: br label [[FOR_BODY:%.*]]
+; PREDICATED_EVL: for.body:
+; PREDICATED_EVL-NEXT: [[IX_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; PREDICATED_EVL-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]]
+; PREDICATED_EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; PREDICATED_EVL: if.then:
+; PREDICATED_EVL-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2
+; PREDICATED_EVL-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1
+; PREDICATED_EVL-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2
+; PREDICATED_EVL-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3
+; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = zext nneg i32 [[IDX0]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP0]]
+; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1
+; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = zext nneg i32 [[IDX1]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP2]]
+; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1
+; PREDICATED_EVL-NEXT: [[TMP4:%.*]] = zext nneg i32 [[IDX2]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1
+; PREDICATED_EVL-NEXT: [[TMP6:%.*]] = zext nneg i32 [[IDX3]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP6]]
+; PREDICATED_EVL-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1
+; PREDICATED_EVL-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP1]], i8 [[TMP3]])
+; PREDICATED_EVL-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]]
+; PREDICATED_EVL-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP5]], i8 [[TMP7]])
+; PREDICATED_EVL-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]]
+; PREDICATED_EVL-NEXT: [[TMP8:%.*]] = zext nneg i32 [[IDX0]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP8]]
+; PREDICATED_EVL-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1
+; PREDICATED_EVL-NEXT: [[TMP9:%.*]] = zext nneg i32 [[IDX1]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP9]]
+; PREDICATED_EVL-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1
+; PREDICATED_EVL-NEXT: [[TMP10:%.*]] = zext nneg i32 [[IDX2]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP10]]
+; PREDICATED_EVL-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1
+; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg i32 [[IDX3]] to i64
+; PREDICATED_EVL-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP11]]
+; PREDICATED_EVL-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1
+; PREDICATED_EVL-NEXT: br label [[FOR_INC]]
+; PREDICATED_EVL: for.inc:
+; PREDICATED_EVL-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1
+; PREDICATED_EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024
+; PREDICATED_EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; PREDICATED_EVL: for.end:
+; PREDICATED_EVL-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %idx0 = shl nuw nsw i32 %ix.024, 2
+ %idx1 = add i32 %idx0, 1
+ %idx2 = add i32 %idx0, 2
+ %idx3 = add i32 %idx0, 3
+
+ %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0
+ %0 = load i8, ptr %array1idx0, align 1
+ %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1
+ %1 = load i8, ptr %array1idx1, align 1
+ %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2
+ %2 = load i8, ptr %array1idx2, align 1
+ %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3
+ %3 = load i8, ptr %array1idx3, align 1
+
+ %cmp.i1 = icmp slt i8 %0, %1
+ %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0
+ %sub1 = sub i8 0, %spec.select.i1
+ %cmp.i2 = icmp slt i8 %2, %3
+ %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2
+ %sub2 = sub i8 0, %spec.select.i2
+
+ %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0
+ store i8 %spec.select.i1, ptr %array3idx0, align 1
+ %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1
+ store i8 %sub1, ptr %array3idx1, align 1
+ %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2
+ store i8 %spec.select.i2, ptr %array3idx2, align 1
+ %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3
+ store i8 %sub2, ptr %array3idx3, align 1
+
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 01a7ea4..3f17c95 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -277,9 +277,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -581,8 +581,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
@@ -771,8 +771,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 2e50c02..af36f18 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -137,9 +137,9 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> splat (i32 1), i32 [[START:%.*]], i32 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> splat (i32 1), i32 [[START:%.*]], i32 0
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1220,9 +1220,9 @@ define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x float> splat (float 1.000000e+00), float [[START:%.*]], i32 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x float> splat (float 1.000000e+00), float [[START:%.*]], i32 0
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 8df21f30..79f490a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -37,13 +37,14 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: Successor(s): scalar.ph, vector.ph
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: vector.ph:
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_START:%.]]> = reduction-start-vector ir<%start>, ir<0>, ir<1>
; IF-EVL-OUTLOOP-NEXT: Successor(s): vector loop
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: <x1> vector loop: {
; IF-EVL-OUTLOOP-NEXT: vector.body:
; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-OUTLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
-; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, vp<[[RDX_SELECT:%.+]]>
+; IF-EVL-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_SELECT:%.+]]>
; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]>
; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[VF]]>
@@ -77,13 +78,14 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: Live-in ir<%n> = original trip-count
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_START:%.]]> = reduction-start-vector ir<%start>, ir<0>, ir<1>
; IF-EVL-INLOOP-NEXT: Successor(s): vector loop
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: <x1> vector loop: {
; IF-EVL-INLOOP-NEXT: vector.body:
; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
-; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]>
+; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; IF-EVL-INLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]>
; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
; IF-EVL-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>, vp<[[VF]]>
@@ -116,12 +118,13 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: Live-in ir<%n> = original trip-count
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP: vector.ph:
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_START:%.]]> = reduction-start-vector ir<%start>, ir<0>, ir<1>
; NO-VP-OUTLOOP-NEXT: Successor(s): vector loop
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: <x1> vector loop: {
; NO-VP-OUTLOOP-NEXT: vector.body:
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; NO-VP-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]>
+; NO-VP-OUTLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; NO-VP-OUTLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]>
; NO-VP-OUTLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
@@ -164,12 +167,13 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: Live-in ir<%n> = original trip-count
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP: vector.ph:
+; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_START:%.]]> = reduction-start-vector ir<%start>, ir<0>, ir<1>
; NO-VP-INLOOP-NEXT: Successor(s): vector loop
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: <x1> vector loop: {
; NO-VP-INLOOP-NEXT: vector.body:
; NO-VP-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; NO-VP-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]>
+; NO-VP-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; NO-VP-INLOOP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]>
; NO-VP-INLOOP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 7c42c3d..2c6fe4f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -1271,12 +1271,12 @@ define i32 @g(i64 %n) {
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i32 [[TMP1]], 4
; CHECK-NEXT: [[N_VEC8:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF7]]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX9:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 04271ff3..2cda253 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -33,7 +33,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i32 %lftr.wideiv, %n
; CHECK: Cost of 0 for VF 2: exit condition instruction %lftr.wideiv = trunc i64 %indvars.iv.next to i32
; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi ir<0>, vp<[[EXT:%.+]]>
+; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi vp<{{.+}}>, vp<[[EXT:%.+]]>
; CHECK: Cost of 0 for VF 2: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
; CHECK: Cost of 0 for VF 2: vp<[[VECP1:%.+]]> = vector-pointer ir<%arrayidx>
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index 800b6f3..6bf8883 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -44,16 +44,16 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]]
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT9]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
@@ -70,12 +70,12 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8
; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L]], 3
@@ -148,16 +148,16 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]]
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT9]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
@@ -174,12 +174,12 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT: [[C:%.*]] = fcmp fast ueq float [[L]], 3.000000e+00
@@ -261,16 +261,16 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]]
; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[N_VEC3]] to i8
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], <i8 0, i8 1, i8 2, i8 3>
-; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT9]], <i8 0, i8 1, i8 2, i8 3>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i8> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX4]] to i8
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
@@ -288,12 +288,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 8
; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 0b5074b..9d2719a 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -189,6 +189,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<1234>, ir<-1>, ir<1>
; CHECK-NEXT: WIDEN-CAST ir<%recur.next> = sext ir<%y> to i32
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
@@ -196,7 +197,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%recur> = phi ir<0>, ir<%recur.next>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%and.red> = phi ir<1234>, ir<%and.red.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%and.red> = phi vp<[[RDX_START]]>, ir<%and.red.next>
; CHECK-NEXT: EMIT vp<[[WIDEN_CAN:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_CAN]]>, vp<[[BTC]]>
; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index 5837a49..5c1f628b 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -313,9 +313,9 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
; VEC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
; VEC-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; VEC-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0
; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; VEC-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 1517ec8..5a5b06d 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -3117,9 +3117,9 @@ define i32 @testoverflowcheck() {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[C_PROMOTED_I]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[C_PROMOTED_I]], i32 0
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -3161,9 +3161,9 @@ define i32 @testoverflowcheck() {
; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], 510
; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
+; IND-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
; IND-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; IND-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
; IND-NEXT: br label [[VECTOR_BODY:%.*]]
; IND: vector.body:
; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -3203,9 +3203,9 @@ define i32 @testoverflowcheck() {
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], 508
; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
+; UNROLL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -3246,9 +3246,9 @@ define i32 @testoverflowcheck() {
; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP3]], [[N_MOD_VF]]
; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
+; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[C_PROMOTED_I]], i32 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> splat (i32 -1), i32 [[C_PROMOTED_I]], i32 0
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -3293,9 +3293,9 @@ define i32 @testoverflowcheck() {
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP3]], 504
; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8
; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTPR_I]], [[DOTCAST]]
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 -1, i32 -1, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 -1, i32 -1, i32 -1>, i32 [[C_PROMOTED_I]], i64 0
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index d57e7fa..95fbc42 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -16,12 +16,13 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector fast ir<0.000000e+00>, ir<0.000000e+00>, ir<1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<[[RDX_START]]>, ir<%red.next>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
@@ -84,12 +85,13 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector fast ir<0.000000e+00>, ir<0.000000e+00>, ir<1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<[[RDX_START]]>, ir<%red.next>
; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[IV]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
@@ -153,12 +155,13 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector nnan ninf nsz ir<0.000000e+00>, ir<0.000000e+00>, ir<1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi vp<[[RDX_START]]>, ir<%muladd>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
@@ -278,12 +281,13 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
@@ -322,12 +326,13 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -371,12 +376,13 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
index 9160ced..265a142 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
@@ -11,17 +11,15 @@ define void @add_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = add <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -35,17 +33,15 @@ define void @fadd_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x float>, ptr %lhs
%rhsv = load <4 x float>, ptr %rhs
%op = fadd <4 x float> %lhsv, %rhsv
- %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
- %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
- store <4 x float> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -59,17 +55,15 @@ define void @sub_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = sub <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -83,17 +77,15 @@ define void @fsub_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = fsub nnan <2 x float> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = fsub nnan <2 x float> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x float>, ptr %lhs
%rhsv = load <4 x float>, ptr %rhs
%op = fsub nnan <4 x float> %lhsv, %rhsv
- %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
- %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
- store <4 x float> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -107,17 +99,15 @@ define void @mul_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = mul <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -131,17 +121,15 @@ define void @fmul_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x float>, ptr %lhs
%rhsv = load <4 x float>, ptr %rhs
%op = fmul contract <4 x float> %lhsv, %rhsv
- %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
- %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
- store <4 x float> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -155,17 +143,15 @@ define void @udiv_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = udiv <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = udiv <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -179,17 +165,15 @@ define void @sdiv_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = sdiv <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -203,17 +187,15 @@ define void @fdiv_2x2(ptr %num, ptr %denom, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = fdiv nnan <2 x double> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = fdiv nnan <2 x double> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP5]], align 16
+; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP5]], align 8
; CHECK-NEXT: ret void
;
%numv = load <4 x double>, ptr %num
%denomv = load <4 x double>, ptr %denom
- %div = fdiv nnan <4 x double> %numv, %denomv
- %divt = call <4 x double> @llvm.matrix.transpose(<4 x double> %div, i32 2, i32 2)
- %divtt = call <4 x double> @llvm.matrix.transpose(<4 x double> %divt, i32 2, i32 2)
- store <4 x double> %divtt, ptr %out
+ %op = fdiv nnan <4 x double> %numv, %denomv
+ call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -227,17 +209,15 @@ define void @urem_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = urem <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = urem <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = urem <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -251,17 +231,15 @@ define void @srem_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = srem <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = srem <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = srem <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -275,17 +253,15 @@ define void @frem_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = frem fast <2 x float> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = frem fast <2 x float> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x float>, ptr %lhs
%rhsv = load <4 x float>, ptr %rhs
%op = frem fast <4 x float> %lhsv, %rhsv
- %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
- %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
- store <4 x float> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -299,17 +275,15 @@ define void @shl_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = shl <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -323,17 +297,15 @@ define void @lshr_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = lshr <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -347,17 +319,15 @@ define void @ashr_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = ashr <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -371,17 +341,15 @@ define void @and_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = and <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -395,17 +363,15 @@ define void @or_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = or <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
@@ -419,16 +385,50 @@ define void @xor_2x2(ptr %lhs, ptr %rhs, ptr %out) {
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[COL_LOAD]], [[COL_LOAD2]]
; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]]
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 4
; CHECK-NEXT: ret void
;
%lhsv = load <4 x i32>, ptr %lhs
%rhsv = load <4 x i32>, ptr %rhs
%op = xor <4 x i32> %lhsv, %rhsv
- %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2)
- %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2)
- store <4 x i32> %optt, ptr %out
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+ ret void
+}
+
+define void @fabs_2x2f64(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2f64(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD]])
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD1]])
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT: ret void
+;
+ %load = load <4 x double>, ptr %in
+ %op = call <4 x double> @llvm.fabs.v4f64(<4 x double> %load)
+ call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+ ret void
+}
+
+define void @abs_2x2i32(ptr %in, ptr %out) {
+; CHECK-LABEL: @abs_2x2i32(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD]], i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD1]], i1 false)
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ %load = load <4 x i32>, ptr %in
+ %op = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %load, i1 false)
+ call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
ret void
}
diff --git a/llvm/test/Transforms/LowerTypeTests/export-inline.ll b/llvm/test/Transforms/LowerTypeTests/export-inline.ll
index 956f0e3..72275b9 100644
--- a/llvm/test/Transforms/LowerTypeTests/export-inline.ll
+++ b/llvm/test/Transforms/LowerTypeTests/export-inline.ll
@@ -1,8 +1,13 @@
-; RUN: opt -mtriple=x86_64-unknown-linux -S -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t %s | FileCheck --check-prefix=CHECK %s
-; RUN: FileCheck --check-prefixes=SUMMARY,SUMMARY-X86 %s < %t
+; REQUIRES: x86-registered-target
-; RUN: opt -mtriple=aarch64-unknown-linux -S -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t %s | FileCheck --check-prefix=CHECK %s
-; RUN: FileCheck --check-prefixes=SUMMARY,SUMMARY-ARM %s < %t
+; RUN: opt -mtriple=i686-unknown-linux -S -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t %s | FileCheck --check-prefixes=CHECK,CHECK-X86-32 %s
+; RUN: FileCheck --check-prefixes=SUMMARY,SUMMARY-X86,SUMMARY-X86-32 %s < %t
+
+; RUN: opt -mtriple=x86_64-unknown-linux -S -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t %s | FileCheck --check-prefixes=CHECK,CHECK-64 %s
+; RUN: FileCheck --check-prefixes=SUMMARY,SUMMARY-X86,SUMMARY-64 %s < %t
+
+; RUN: opt -mtriple=aarch64-unknown-linux -S -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t %s | FileCheck --check-prefixes=CHECK,CHECK-64 %s
+; RUN: FileCheck --check-prefixes=SUMMARY,SUMMARY-64,SUMMARY-ARM %s < %t
@foo = constant [2048 x i8] zeroinitializer, !type !0, !type !1, !type !2, !type !3
@@ -13,15 +18,19 @@
; CHECK: [[G:@[0-9]+]] = private constant { [2048 x i8] } zeroinitializer
-; CHECK: @__typeid_typeid1_global_addr = hidden alias i8, getelementptr (i8, ptr [[G]], i64 6)
+; CHECK-X86-32: @__typeid_typeid1_global_addr = hidden alias i8, getelementptr (i8, ptr [[G]], i32 6)
+; CHECK-64: @__typeid_typeid1_global_addr = hidden alias i8, getelementptr (i8, ptr [[G]], i64 6)
; CHECK-X86: @__typeid_typeid1_align = hidden alias i8, inttoptr (i8 1 to ptr)
; CHECK-X86: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 3 to ptr)
; CHECK-X86: @__typeid_typeid1_inline_bits = hidden alias i8, inttoptr (i32 9 to ptr)
-; CHECK: @__typeid_typeid2_global_addr = hidden alias i8, getelementptr (i8, ptr [[G]], i64 136)
+; CHECK-X86-32: @__typeid_typeid2_global_addr = hidden alias i8, getelementptr (i8, ptr [[G]], i32 136)
+; CHECK-64: @__typeid_typeid2_global_addr = hidden alias i8, getelementptr (i8, ptr [[G]], i64 136)
; CHECK-X86: @__typeid_typeid2_align = hidden alias i8, inttoptr (i8 2 to ptr)
; CHECK-X86: @__typeid_typeid2_size_m1 = hidden alias i8, inttoptr (i64 33 to ptr)
-; CHECK-X86: @__typeid_typeid2_inline_bits = hidden alias i8, inttoptr (i64 8589934593 to ptr)
+; CHECK-X86-64: @__typeid_typeid2_inline_bits = hidden alias i8, inttoptr (i64 8589934593 to ptr)
+; CHECK-X86-32: @__typeid_typeid2_byte_array = hidden alias i8, ptr @bits
+; CHECK-X86-32: @__typeid_typeid2_bit_mask = hidden alias i8, inttoptr (i8 1 to ptr)
; CHECK: @foo = alias [2048 x i8], ptr [[G]]
@@ -41,8 +50,10 @@
; SUMMARY-NEXT: WPDRes:
; SUMMARY-NEXT: typeid2:
; SUMMARY-NEXT: TTRes:
-; SUMMARY-NEXT: Kind: Inline
-; SUMMARY-NEXT: SizeM1BitWidth: 6
+; SUMMARY-X86-32-NEXT: Kind: ByteArray
+; SUMMARY-X86-32-NEXT: SizeM1BitWidth: 7
+; SUMMARY-64-NEXT: Kind: Inline
+; SUMMARY-64-NEXT: SizeM1BitWidth: 6
; SUMMARY-X86-NEXT: AlignLog2: 0
; SUMMARY-X86-NEXT: SizeM1: 0
; SUMMARY-X86-NEXT: BitMask: 0
diff --git a/llvm/test/Transforms/OpenMP/nested_parallelism.ll b/llvm/test/Transforms/OpenMP/nested_parallelism.ll
index 1679a27..412e5ea 100644
--- a/llvm/test/Transforms/OpenMP/nested_parallelism.ll
+++ b/llvm/test/Transforms/OpenMP/nested_parallelism.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+; RUN: opt -S -mtriple=nvptx64 -passes=openmp-opt < %s | FileCheck %s
+; REQUIRES: nvptx-registered-target
; void foo1(int i) {
; #pragma omp parallel
@@ -20,8 +21,6 @@
; foo1(i);
; }
-target triple = "nvptx64"
-
%struct.ident_t = type { i32, i32, i32, i32, ptr }
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
@@ -64,7 +63,8 @@ define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l13
; CHECK-NEXT: br label [[_Z3FOOI_INTERNALIZED_EXIT]]
; CHECK: _Z3fooi.internalized.exit:
; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR2]]
-; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i_shared to ptr), ptr addrspace(5) [[TMP4]], align 8
; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1)
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
; CHECK-NEXT: call void @__kmpc_target_deinit()
@@ -109,7 +109,8 @@ define hidden void @_Z3fooi(i32 noundef %i1) local_unnamed_addr #1 {
; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]]
; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[I]], align 16
-; CHECK-NEXT: store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr [[I]], ptr addrspace(5) [[TMP1]], align 8
; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__, ptr nonnull @__omp_outlined___wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1)
; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I]], i64 4) #[[ATTR2]]
; CHECK-NEXT: ret void
@@ -141,7 +142,8 @@ define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l16
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: store i32 [[I_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(3) @i.i_shared, align 16
-; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr [[CAPTURED_VARS_ADDRS_I]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @i.i_shared to ptr), ptr addrspace(5) [[TMP2]], align 8
; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1)
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
; CHECK-NEXT: call void @__kmpc_target_deinit()
@@ -175,7 +177,8 @@ define hidden void @_Z4foo1i(i32 noundef %i1) local_unnamed_addr #1 {
; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: [[I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]]
; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[I]], align 16
-; CHECK-NEXT: store ptr [[I]], ptr [[CAPTURED_VARS_ADDRS]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr [[I]], ptr addrspace(5) [[TMP1]], align 8
; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS]], i64 1)
; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I]], i64 4) #[[ATTR2]]
; CHECK-NEXT: ret void
@@ -202,7 +205,8 @@ define internal void @__omp_outlined__(ptr noalias nocapture readnone %.global_t
; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: [[I_I:%.*]] = tail call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]]
; CHECK-NEXT: store i32 [[TMP0]], ptr [[I_I]], align 16
-; CHECK-NEXT: store ptr [[I_I]], ptr [[CAPTURED_VARS_ADDRS_I]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr [[I_I]], ptr addrspace(5) [[TMP2]], align 8
; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I]], i64 1)
; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I]], i64 4) #[[ATTR2]]
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I]])
@@ -228,15 +232,17 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #5 {
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I_I:%.*]] = alloca [1 x ptr], align 8
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]])
-; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[TMP5]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]])
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: [[I_I_I:%.*]] = call align 16 dereferenceable_or_null(4) ptr @__kmpc_alloc_shared(i64 4) #[[ATTR2]]
; CHECK-NEXT: store i32 [[TMP4]], ptr [[I_I_I]], align 16
-; CHECK-NEXT: store ptr [[I_I_I]], ptr [[CAPTURED_VARS_ADDRS_I_I]], align 8
-; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]], i64 1)
+; CHECK-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[CAPTURED_VARS_ADDRS_I_I]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr [[I_I_I]], ptr addrspace(5) [[TMP7]], align 8
+; CHECK-NEXT: call void @__kmpc_parallel_51(ptr nonnull @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, ptr nonnull @__omp_outlined__1, ptr nonnull @__omp_outlined__1_wrapper, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]], i64 1)
; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[I_I_I]], i64 4) #[[ATTR2]]
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[CAPTURED_VARS_ADDRS_I_I]])
; CHECK-NEXT: ret void
@@ -287,7 +293,8 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #5 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr nonnull [[GLOBAL_ARGS]])
-; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[GLOBAL_ARGS]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[TMP5]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
; CHECK-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP4]], 1
diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll
index 29f2030..419d3d0 100644
--- a/llvm/test/Transforms/OpenMP/remove_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@@ -1,9 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
-; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
-; RUN: opt -openmp-opt-disable-deglobalization -S -passes=openmp-opt < %s | FileCheck %s --check-prefix=CHECK-DISABLED
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64"
+; RUN: opt -mtriple=nvptx64 -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+; RUN: opt -mtriple=nvptx64 -openmp-opt-disable-deglobalization -S -passes=openmp-opt < %s | FileCheck %s --check-prefix=CHECK-DISABLED
+; REQUIRES: nvptx-registered-target
@S = external local_unnamed_addr global ptr
%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
@@ -183,17 +182,22 @@ define internal void @convert_and_move_alloca() {
; CHECK-DISABLED-NEXT: entry:
; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4
; CHECK-DISABLED-NEXT: [[IV_PTR:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT: [[UB_PTR:%.*]] = alloca i32, align 4
+; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[UB_PTR]] to ptr addrspace(5)
; CHECK-DISABLED-NEXT: br label [[INITLOOP:%.*]]
; CHECK-DISABLED: initloop:
-; CHECK-DISABLED-NEXT: store i32 0, ptr [[IV_PTR]], align 4
+; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5)
+; CHECK-DISABLED-NEXT: store i32 0, ptr addrspace(5) [[TMP1]], align 4
; CHECK-DISABLED-NEXT: br label [[LOOPBODY:%.*]]
; CHECK-DISABLED: loopbody:
-; CHECK-DISABLED-NEXT: [[IV:%.*]] = load i32, ptr [[IV_PTR]], align 4
-; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IV]], 10
-; CHECK-DISABLED-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[LOOPINC:%.*]]
+; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5)
+; CHECK-DISABLED-NEXT: [[IV:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4
+; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[IV]], 10
+; CHECK-DISABLED-NEXT: br i1 [[TMP3]], label [[EXIT:%.*]], label [[LOOPINC:%.*]]
; CHECK-DISABLED: loopinc:
; CHECK-DISABLED-NEXT: [[INC:%.*]] = add i32 [[IV]], 1
-; CHECK-DISABLED-NEXT: store i32 [[INC]], ptr [[IV_PTR]], align 4
+; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[IV_PTR]] to ptr addrspace(5)
+; CHECK-DISABLED-NEXT: store i32 [[INC]], ptr addrspace(5) [[TMP4]], align 4
; CHECK-DISABLED-NEXT: br label [[LOOPBODY]]
; CHECK-DISABLED: exit:
; CHECK-DISABLED-NEXT: ret void
@@ -268,7 +272,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
; CHECK-DISABLED: attributes #[[ATTR6]] = { nounwind }
;.
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
-; CHECK: [[META1]] = !DIFile(filename: "remove_globalization.c", directory: {{.*}})
+; CHECK: [[META1]] = !DIFile(filename: "{{.*}}remove_globalization.c", directory: {{.*}})
; CHECK: [[META2]] = !{}
; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
@@ -279,7 +283,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
; CHECK: [[META9]] = !DISubroutineType(types: [[META2]])
;.
; CHECK-DISABLED: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
-; CHECK-DISABLED: [[META1]] = !DIFile(filename: "remove_globalization.c", directory: {{.*}})
+; CHECK-DISABLED: [[META1]] = !DIFile(filename: "{{.*}}remove_globalization.c", directory: {{.*}})
; CHECK-DISABLED: [[META2]] = !{}
; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
index 809fc39..d057e5b 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -71,28 +71,29 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK: user_code.entry:
; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[N]], 42
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr)
-; CHECK-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[LOC]] to ptr addrspace(5)
+; CHECK-NEXT: store ptr [[SELECT]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32
; CHECK-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32
; CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]]
; CHECK: region.check.tid:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
; CHECK: region.guarded:
; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]]
-; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]]
-; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP6]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
; CHECK: region.guarded.end:
; CHECK-NEXT: br label [[REGION_BARRIER]]
; CHECK: region.barrier:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP2]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP3]])
; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
; CHECK: region.exit:
; CHECK-NEXT: call void @usei8ptr(ptr captures(none) [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]]
@@ -108,19 +109,20 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID5:%.*]]
; CHECK: region.check.tid5:
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]]
; CHECK: region.guarded4:
-; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP8]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP9]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]]
; CHECK: region.guarded.end1:
; CHECK-NEXT: br label [[REGION_BARRIER2]]
; CHECK: region.barrier2:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP7]])
; CHECK-NEXT: br label [[REGION_EXIT3]]
; CHECK: region.exit3:
+; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(5)
; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: __omp_outlined__.exit:
@@ -130,17 +132,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]]
; CHECK: region.check.tid10:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[TMP10]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT: br i1 [[TMP12]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]]
; CHECK: region.guarded9:
-; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP11]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP13]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]]
; CHECK: region.guarded.end6:
; CHECK-NEXT: br label [[REGION_BARRIER7]]
; CHECK: region.barrier7:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP9]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP11]])
; CHECK-NEXT: br label [[REGION_EXIT8:%.*]]
; CHECK: region.exit8:
; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -148,17 +150,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]]
; CHECK: region.check.tid15:
-; CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0
+; CHECK-NEXT: br i1 [[TMP15]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]]
; CHECK: region.guarded14:
-; CHECK-NEXT: [[TMP14:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP14]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP16]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]]
; CHECK: region.guarded.end11:
; CHECK-NEXT: br label [[REGION_BARRIER12]]
; CHECK: region.barrier12:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP12]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP14]])
; CHECK-NEXT: br label [[REGION_EXIT13:%.*]]
; CHECK: region.exit13:
; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -166,17 +168,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]]
; CHECK: region.check.tid20:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]]
; CHECK: region.guarded19:
-; CHECK-NEXT: [[TMP17:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1)
-; CHECK-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP17]], align 4, !noalias [[META7]]
+; CHECK-NEXT: [[TMP19:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP19]], align 4, !noalias [[META7]]
; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]]
; CHECK: region.guarded.end16:
; CHECK-NEXT: br label [[REGION_BARRIER17]]
; CHECK: region.barrier17:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP15]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP17]])
; CHECK-NEXT: br label [[REGION_EXIT18:%.*]]
; CHECK: region.exit18:
; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
@@ -233,17 +235,18 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED: user_code.entry:
; CHECK-DISABLED-NEXT: [[C:%.*]] = icmp eq i64 [[N]], 42
; CHECK-DISABLED-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr)
-; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8
-; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
+; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[LOC]] to ptr addrspace(5)
+; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]]
; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]]
; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
-; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1)
-; CHECK-DISABLED-NEXT: store i32 1, ptr addrspace(1) [[TMP2]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 1, ptr addrspace(1) [[TMP3]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32
; CHECK-DISABLED-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32
; CHECK-DISABLED-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]]
-; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1)
-; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP3]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: call void @usei8ptr(ptr captures(none) [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]]
; CHECK-DISABLED-NEXT: br label [[FOR_COND_I:%.*]]
; CHECK-DISABLED: for.cond.i:
@@ -255,8 +258,9 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED-NEXT: [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1
; CHECK-DISABLED-NEXT: [[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]]
-; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1)
-; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP6:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(5)
; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1
; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-DISABLED: __omp_outlined__.exit:
@@ -264,18 +268,18 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %
; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]]
-; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1)
-; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP7]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]]
-; CHECK-DISABLED-NEXT: [[TMP6:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1)
-; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP6]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP8]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64
; CHECK-DISABLED-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]]
-; CHECK-DISABLED-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1)
-; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP7]], align 4, !noalias [[META7]]
+; CHECK-DISABLED-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1)
+; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP9]], align 4, !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
index a644fe1..821f5b1 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: opt -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefixes=CHECK-DISABLE-SPMDIZATION
+; RUN: opt -S -mtriple=nvptx64 -passes=openmp-opt < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -S -mtriple=nvptx64 -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefixes=CHECK-DISABLE-SPMDIZATION
+; REQUIRES: nvptx-registered-target
;
; int G;
;
@@ -26,8 +27,6 @@
; generic_helper();
; }
;
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64"
%struct.ident_t = type { i32, i32, i32, i32, ptr }
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
@@ -195,6 +194,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; CHECK-NEXT: ret void
;
@@ -203,6 +204,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-DISABLE-SPMDIZATION-NEXT: entry:
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @unknown() #[[ATTR8:[0-9]+]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
;
@@ -224,6 +227,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; CHECK-NEXT: ret void
@@ -235,6 +241,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
index 1cfce14..17e6803 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: opt -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefixes=CHECK-DISABLE-SPMDIZATION
+; RUN: opt -S -mtriple=nvptx64 -passes=openmp-opt < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -S -mtriple=nvptx64 -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefixes=CHECK-DISABLE-SPMDIZATION
+; REQUIRES: nvptx-registered-target
;
; __local int G;
;
@@ -27,8 +28,6 @@
; generic_helper();
; }
;
-target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64"
%struct.ident_t = type { i32, i32, i32, i32, ptr }
%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
@@ -281,6 +280,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @leaf() #[[ATTR8]]
; CHECK-NEXT: ret void
;
@@ -289,6 +290,8 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
; CHECK-DISABLE-SPMDIZATION-NEXT: entry:
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[DOTGLOBAL_TID__ADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DOTBOUND_TID__ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @leaf() #[[ATTR8]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
;
@@ -310,6 +313,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR8]]
; CHECK-NEXT: ret void
@@ -321,6 +327,9 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-DISABLE-SPMDIZATION-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DOTADDR]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[DOTADDR1]] to ptr addrspace(5)
+; CHECK-DISABLE-SPMDIZATION-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DOTZERO_ADDR]] to ptr addrspace(5)
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
; CHECK-DISABLE-SPMDIZATION-NEXT: call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR8]]
; CHECK-DISABLE-SPMDIZATION-NEXT: ret void
diff --git a/llvm/test/Verifier/target-ext-vector-invalid.ll b/llvm/test/Verifier/target-ext-vector-invalid.ll
new file mode 100644
index 0000000..59e3e78
--- /dev/null
+++ b/llvm/test/Verifier/target-ext-vector-invalid.ll
@@ -0,0 +1,8 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: invalid vector element type
+
+define void @bad() {
+ %v = alloca <2 x target("spirv.Image")>
+ ret void
+}
\ No newline at end of file
diff --git a/llvm/test/Verifier/target-ext-vector.ll b/llvm/test/Verifier/target-ext-vector.ll
new file mode 100644
index 0000000..433715b
--- /dev/null
+++ b/llvm/test/Verifier/target-ext-vector.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=verify -S %s | FileCheck %s
+
+define <2 x target("llvm.test.vectorelement")> @vec_ops(<2 x target("llvm.test.vectorelement")> %x) {
+; CHECK-LABEL: define <2 x target("llvm.test.vectorelement")> @vec_ops(
+; CHECK-SAME: <2 x target("llvm.test.vectorelement")> [[X:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = alloca <2 x target("llvm.test.vectorelement")>{{.*}}
+; CHECK-NEXT: store <2 x target("llvm.test.vectorelement")> [[X]], ptr [[A]], {{.*}}
+; CHECK-NEXT: [[LOAD:%.*]] = load <2 x target("llvm.test.vectorelement")>, ptr [[A]], {{.*}}
+; CHECK-NEXT: [[ELT:%.*]] = extractelement <2 x target("llvm.test.vectorelement")> [[LOAD]], i64 0
+; CHECK-NEXT: [[RES:%.*]] = insertelement <2 x target("llvm.test.vectorelement")> poison, target("llvm.test.vectorelement") [[ELT]], i64 1
+; CHECK-NEXT: ret <2 x target("llvm.test.vectorelement")> [[RES]]
+;
+ %a = alloca <2 x target("llvm.test.vectorelement")>
+ store <2 x target("llvm.test.vectorelement")> %x, ptr %a
+ %load = load <2 x target("llvm.test.vectorelement")>, ptr %a
+ %elt = extractelement <2 x target("llvm.test.vectorelement")> %load, i64 0
+ %res = insertelement <2 x target("llvm.test.vectorelement")> poison, target("llvm.test.vectorelement") %elt, i64 1
+ ret <2 x target("llvm.test.vectorelement")> %res
+}
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
index 581dad6..54b5f16 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
@@ -2536,14 +2536,14 @@ drps
# CHECK-NEXT: 1 2 0.50 bics x3, xzr, x3, lsl #1
# CHECK-NEXT: 1 2 0.50 tst w3, w7, lsl #31
# CHECK-NEXT: 1 2 0.50 tst x2, x20, asr #2
-# CHECK-NEXT: 1 0 0.06 mov x3, x6
-# CHECK-NEXT: 1 0 0.06 mov x3, xzr
-# CHECK-NEXT: 1 0 0.06 mov wzr, w2
-# CHECK-NEXT: 1 0 0.06 mov w3, w5
+# CHECK-NEXT: 1 0 0.17 mov x3, x6
+# CHECK-NEXT: 1 0 0.17 mov x3, xzr
+# CHECK-NEXT: 1 0 0.17 mov wzr, w2
+# CHECK-NEXT: 1 0 0.17 mov w3, w5
# CHECK-NEXT: 1 1 0.17 movz w2, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 mov w2, #-1235
# CHECK-NEXT: 1 1 0.17 mov x2, #5299989643264
-# CHECK-NEXT: 1 0 0.06 mov x2, #0
+# CHECK-NEXT: 1 0 0.17 mov x2, #0
# CHECK-NEXT: 1 1 0.17 movk w3, #0
# CHECK-NEXT: 1 1 0.17 movz x4, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 movk w5, #0, lsl #16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index fbf65e2..3398331 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [1] Code Region - FPR16-bit
@@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [2] Code Region - FPR32-bit
@@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [3] Code Region - FPR64-bit
@@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [4] Code Region - FPR128-bit
@@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-b
@@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-h
@@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [7] Code Region - SIMD64-bit-s
@@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [8] Code Region - SIMD64-bit-d
@@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [9] Code Region - insr
@@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 803
# CHECK-NEXT: Total uOps: 300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
-# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
-# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] .D===============eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] .D=====================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] .D=======================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] .D=============================eeER add z0.s, z0.s, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
-# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
-# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 4 12.5 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 18.5 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 15.5 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
index 0f5ab18..39a779b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
@@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 6.8 0.1 0.0 <total>
# CHECK: [1] Code Region - smaddl
@@ -353,7 +353,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -368,8 +368,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] .D===========eeER smaddl x0, w0, w0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -380,9 +380,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 6.8 0.1 0.0 <total>
# CHECK: [2] Code Region - fmadd
@@ -391,7 +391,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.35
# CHECK-NEXT: IPC: 0.35
# CHECK-NEXT: Block RThroughput: 1.5
@@ -406,12 +406,12 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0
# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0
# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2
-# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0
-# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0
-# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] .D================eeER . . .. fadd d0, d0, d0
+# CHECK-NEXT: [1,1] .D==================eeeeER . .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] .D======================eeeER . .. fmul d0, d0, d0
+# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] .D=============================eeeeER fmadd d0, d0, d1, d2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -420,13 +420,13 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmadd d0, d0, d1, d2
-# CHECK-NEXT: 2 15.7 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 15.2 0.1 0.0 <total>
# CHECK: [3] Code Region - saba
@@ -435,7 +435,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.5
@@ -450,8 +450,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER saba v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -462,9 +462,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [4] Code Region - sdot
@@ -473,7 +473,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 0.8
@@ -488,8 +488,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,2] .D===============eeeER . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] .D==================eeeER sdot v0.4s, v0.16b, v1.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -500,9 +500,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: 2 10.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 10.5 0.1 0.0 <total>
# CHECK: [5] Code Region - smmla
@@ -511,7 +511,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 0.8
@@ -526,8 +526,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2] D================eeeER . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,2] .D===============eeeER . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] .D==================eeeER smmla v0.4s, v0.16b, v1.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -538,9 +538,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: 2 10.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 10.5 0.1 0.0 <total>
# CHECK: [6] Code Region - mla
@@ -549,7 +549,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
@@ -564,8 +564,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D==================eeeeER. . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER mla v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -576,9 +576,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [7] Code Region - sqrdmlah
@@ -587,7 +587,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.29
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 3.5
@@ -602,8 +602,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeER . . . sqrdmlah v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] D==============eeeeER . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D==================eeeeER. . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D====================eeeeER . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D========================eeeeER sqrdmlah v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2] .D===================eeeeER . sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=======================eeeeER sqrdmlah v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -614,9 +614,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [8] Code Region - smlal2
@@ -625,7 +625,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
@@ -640,8 +640,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D==================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -652,9 +652,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [9] Code Region - sadalp
@@ -663,7 +663,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.5
@@ -678,8 +678,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER sadalp v0.2d, v0.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -690,9 +690,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [10] Code Region - ssra
@@ -701,7 +701,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.5
@@ -716,8 +716,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,2] .D=================eeeeER. . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3] .D=====================eeeeER ssra v0.2d, v0.2d, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -728,9 +728,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [11] Code Region - fcmla
@@ -739,7 +739,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -754,8 +754,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -766,9 +766,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [12] Code Region - fmla
@@ -777,7 +777,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.35
# CHECK-NEXT: IPC: 0.35
# CHECK-NEXT: Block RThroughput: 1.5
@@ -792,12 +792,12 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] .D================eeeER . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D=================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] .D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] .D=============================eeeeER fmla v0.2d, v0.2d, v1.2d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -806,13 +806,13 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: 2 15.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 14.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 14.8 0.1 0.0 <total>
# CHECK: [13] Code Region - fmlal
@@ -821,7 +821,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.32
# CHECK-NEXT: IPC: 0.32
# CHECK-NEXT: Block RThroughput: 1.5
@@ -836,12 +836,12 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,4] D===========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,5] D===============eeeeER . . . . fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D======================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2] D==========================eeER . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] D============================eeeeER. . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,4] D==============================eeeeER . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5] D==================================eeeeER fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] .D==================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D=====================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] .D=========================eeER . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] .D===========================eeeeER. . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4] .D=============================eeeeER . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] .D=================================eeeeER fmlal v0.4s, v0.4h, v1.4h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -850,13 +850,13 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 4. 2 21.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5. 2 25.5 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: 2 18.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4. 2 21.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 25.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 17.5 0.1 0.0 <total>
# CHECK: [14] Code Region - bfdot
@@ -865,7 +865,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1603
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.25
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -880,8 +880,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfdot v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1] D===================eeeeeER . . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D======================eeeeeER. . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D===========================eeeeeER bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D==========================eeeeeER bfdot v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -892,9 +892,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [15] Code Region - bfmmla
@@ -903,7 +903,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.21
# CHECK-NEXT: IPC: 0.21
# CHECK-NEXT: Block RThroughput: 1.0
@@ -918,8 +918,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=============eeeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1] D======================eeeeeeER . . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D==========================eeeeeeER. . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D================================eeeeeeER bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D=========================eeeeeeER. . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D===============================eeeeeeER bfmmla v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -930,9 +930,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1. 2 13.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 23.5 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [16] Code Region - bfmlalb
@@ -941,7 +941,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -956,8 +956,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D========================eeeeeER bfmlalb v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -968,9 +968,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [17] Code Region - crc32b
@@ -979,7 +979,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -994,8 +994,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. crc32b w0, w0, w0
# CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0
# CHECK-NEXT: [1,1] D=========eeER .. crc32b w0, w0, w1
-# CHECK-NEXT: [1,2] D==========eeER.. crc32b w0, w0, w1
-# CHECK-NEXT: [1,3] D============eeER crc32b w0, w0, w0
+# CHECK-NEXT: [1,2] .D=========eeER.. crc32b w0, w0, w1
+# CHECK-NEXT: [1,3] .D===========eeER crc32b w0, w0, w0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1006,9 +1006,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32b w0, w0, w1
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32b w0, w0, w1
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32b w0, w0, w0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 crc32b w0, w0, w1
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 crc32b w0, w0, w0
+# CHECK-NEXT: 2 6.8 0.1 0.0 <total>
# CHECK: [18] Code Region - Z saba
@@ -1017,7 +1017,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 1.5
@@ -1030,10 +1030,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . saba z0.d, z1.d, z2.d
# CHECK-NEXT: [0,2] D======eeeeER . . . . saba z0.d, z1.d, z2.d
# CHECK-NEXT: [0,3] D==========eeeeER . . . saba z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2] D====================eeeeER . saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3] D========================eeeeER saba z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D===================eeeeER . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D=======================eeeeER saba z0.d, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1042,11 +1042,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 saba z0.d, z1.d, z2.d
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 saba z0.d, z1.d, z2.d
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 saba z0.d, z0.d, z1.d
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 saba z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [19] Code Region - Z sadalp
@@ -1055,7 +1055,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 1.5
@@ -1068,10 +1068,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp z0.d, p0/m, z1.s
# CHECK-NEXT: [0,2] D======eeeeER . . . . sadalp z0.d, p0/m, z1.s
# CHECK-NEXT: [0,3] D==========eeeeER . . . sadalp z0.d, p0/m, z0.s
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,2] D====================eeeeER . sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,3] D========================eeeeER sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2] .D===================eeeeER . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3] .D=======================eeeeER sadalp z0.d, p0/m, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1080,11 +1080,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sadalp z0.d, p0/m, z0.s
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [20] Code Region - Z ssra
@@ -1093,7 +1093,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 1.5
@@ -1106,10 +1106,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra z0.d, z1.d, #1
# CHECK-NEXT: [0,2] D======eeeeER . . . . ssra z0.d, z1.d, #1
# CHECK-NEXT: [0,3] D==========eeeeER . . . ssra z0.d, z0.d, #1
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . ssra z0.d, z1.d, #1
-# CHECK-NEXT: [1,2] D====================eeeeER . ssra z0.d, z1.d, #1
-# CHECK-NEXT: [1,3] D========================eeeeER ssra z0.d, z0.d, #1
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,2] .D===================eeeeER . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,3] .D=======================eeeeER ssra z0.d, z0.d, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1118,11 +1118,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 ssra z0.d, z1.d, #1
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 ssra z0.d, z1.d, #1
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 ssra z0.d, z0.d, #1
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 ssra z0.d, z0.d, #1
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [21] Code Region - Z cdot.s
@@ -1131,7 +1131,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.42
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1144,10 +1144,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeER . . .. cdot z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [0,2] D======eeeER . . .. cdot z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [0,3] D=========eeeER. . .. cdot z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=================eeeER .. cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2] D==================eeeER .. cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3] D=====================eeeER cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] .D=================eeeER .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] .D====================eeeER cdot z0.s, z0.b, z1.b, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1156,11 +1156,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 2. 2 13.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [22] Code Region - Z cdot.d
@@ -1169,7 +1169,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1182,10 +1182,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [0,2] D======eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [0,3] D==========eeeeER . . . cdot z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,2] D====================eeeeER . cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,3] D========================eeeeER cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2] .D===================eeeeER . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3] .D=======================eeeeER cdot z0.d, z0.h, z1.h, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1194,11 +1194,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [23] Code Region - Z cmla.b
@@ -1207,7 +1207,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1220,10 +1220,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . cmla z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [0,2] D======eeeeER . . . . cmla z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [0,3] D==========eeeeER . . . cmla z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2] D====================eeeeER . cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3] D========================eeeeER cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] .D===================eeeeER . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] .D=======================eeeeER cmla z0.b, z0.b, z1.b, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1232,11 +1232,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [24] Code Region - Z cmla.d
@@ -1245,7 +1245,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1803
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.28
# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1258,10 +1258,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [0,2] D========eeeeeER . . . . . cmla z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [0,3] D=============eeeeeER . . . . cmla z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3] D===============================eeeeeER cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D==============================eeeeeER cmla z0.d, z0.d, z1.d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1270,11 +1270,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.0 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: 2 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [25] Code Region - Z sdot.s
@@ -1283,7 +1283,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.42
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1296,10 +1296,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b
# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b
# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1308,11 +1308,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [26] Code Region - Z sudot
@@ -1321,7 +1321,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.42
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1334,10 +1334,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b[1]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1346,11 +1346,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [27] Code Region - Z sdot.d
@@ -1359,7 +1359,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1372,10 +1372,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h
# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h
# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,2] D====================eeeeER . sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D===================eeeeER . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=======================eeeeER sdot z0.d, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1384,11 +1384,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [28] Code Region - Z smmla
@@ -1397,7 +1397,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 0.8
@@ -1412,8 +1412,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D========eeeER . . . smmla z0.s, z0.b, z1.b
# CHECK-NEXT: [1,0] D===========eeeeER . . mul z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D===============eeeER . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2] D================eeeER . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3] D===================eeeER smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,2] .D===============eeeER . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D==================eeeER smmla z0.s, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1424,9 +1424,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla z0.s, z0.b, z1.b
-# CHECK-NEXT: 2 10.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 10.5 0.1 0.0 <total>
# CHECK: [29] Code Region - Z mla.b
@@ -1435,7 +1435,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1448,10 +1448,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [0,2] D======eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [0,3] D==========eeeeER . . . mla z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,2] D====================eeeeER . mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,3] D========================eeeeER mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=======================eeeeER mla z0.b, p0/m, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1460,11 +1460,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [30] Code Region - Z mla.d
@@ -1473,7 +1473,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1803
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.28
# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1486,10 +1486,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2] D========eeeeeER . . . . . mla z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3] D=============eeeeeER . . . . mla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3] D===============================eeeeeER mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==============================eeeeeER mla z0.d, p0/m, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1498,11 +1498,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: 2 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [31] Code Region - Z smlalb
@@ -1511,7 +1511,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1524,10 +1524,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . smlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,2] D======eeeeER . . . . smlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,3] D==========eeeeER . . . smlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2] D====================eeeeER . smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3] D========================eeeeER smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] .D===================eeeeER . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] .D=======================eeeeER smlalb z0.d, z0.s, z1.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1536,11 +1536,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 smlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [32] Code Region - Z sqdmlalb
@@ -1549,7 +1549,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.33
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1562,10 +1562,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqdmlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0] D===============eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2] D======================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3] D==========================eeeeER sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] .D=========================eeeeER sqdmlalb z0.d, z0.s, z1.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1574,11 +1574,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: 2 14.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [33] Code Region - Z sqrdmlah.b
@@ -1587,7 +1587,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.33
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1600,10 +1600,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b
# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b
# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqrdmlah z0.b, z0.b, z1.b
-# CHECK-NEXT: [1,0] D===============eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,2] D======================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,3] D==========================eeeeER sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=========================eeeeER sqrdmlah z0.b, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1612,11 +1612,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
-# CHECK-NEXT: 2 14.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [34] Code Region - Z sqrdmlah.d
@@ -1625,7 +1625,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1803
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.28
# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1638,10 +1638,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d
# CHECK-NEXT: [0,2] D========eeeeeER . . . . . sqrdmlah z0.d, z1.d, z2.d
# CHECK-NEXT: [0,3] D=============eeeeeER . . . . sqrdmlah z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3] D===============================eeeeeER sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==============================eeeeeER sqrdmlah z0.d, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1650,11 +1650,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.0 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
-# CHECK-NEXT: 2 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [35] Code Region - Z fcmla ZPmZZ
@@ -1663,7 +1663,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1678,8 +1678,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1690,9 +1690,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [36] Code Region - Z fcmla ZZZI
@@ -1701,7 +1701,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1716,8 +1716,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3] .D========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1728,9 +1728,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [37] Code Region - Z fmla ZPmZZ
@@ -1739,7 +1739,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1754,8 +1754,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, p0/m, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1766,9 +1766,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [38] Code Region - Z fmla ZZZI
@@ -1777,7 +1777,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1792,8 +1792,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1]
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, z0.d, z1.d[1]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1804,9 +1804,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [39] Code Region - Z fmlalb ZZZ
@@ -1815,7 +1815,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1830,8 +1830,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D================eeeeER . . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D==================eeeeER. . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D======================eeeeER fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmlalb z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1842,9 +1842,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [40] Code Region - Z bfdot
@@ -1853,7 +1853,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1603
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.25
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1868,8 +1868,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfdot z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D===================eeeeeER . . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D======================eeeeeER. . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D===========================eeeeeER bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D==========================eeeeeER bfdot z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1880,9 +1880,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfdot z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [41] Code Region - Z bfmmla
@@ -1891,7 +1891,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.21
# CHECK-NEXT: IPC: 0.21
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1906,8 +1906,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=============eeeeeeER . . . . bfmmla z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D======================eeeeeeER . . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D==========================eeeeeeER. . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D================================eeeeeeER bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D=========================eeeeeeER. . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D===============================eeeeeeER bfmmla z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1918,9 +1918,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 13.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 23.5 0.0 0.0 bfmmla z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [42] Code Region - bfmlalb
@@ -1929,7 +1929,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1944,8 +1944,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D========================eeeeeER bfmlalb z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1956,6 +1956,6 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 6cba45c..49af4df 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -5071,19 +5071,19 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 2 2 1.00 movs p0.b, p0/z, p0.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15/z, p15.b
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL1
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL12
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL2
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL3
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL1
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL12
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL2
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL3
# CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b
# CHECK-NEXT: 1 5 1.00 msb z0.d, p7/m, z1.d, z31.d
# CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h
# CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL1, x3
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL12, x3
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL2, x3
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL3, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL1, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL12, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL2, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL3, x3
# CHECK-NEXT: 1 4 0.50 mul z0.b, p7/m, z0.b, z31.b
# CHECK-NEXT: 1 4 0.50 mul z0.b, z1.b, z2.b
# CHECK-NEXT: 2 5 1.00 mul z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
index 1ef7468..c7a93d1 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
@@ -733,7 +733,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.97
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 1.7
@@ -745,8 +745,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [1] Code Region - G02
@@ -769,7 +769,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.97
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 1.7
@@ -781,8 +781,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -794,9 +794,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [2] Code Region - G03
@@ -805,7 +805,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.97
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 1.7
@@ -817,8 +817,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -830,9 +830,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [3] Code Region - G04
@@ -841,7 +841,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.76
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 3.0
@@ -852,9 +852,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -865,10 +865,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [4] Code Region - G05
@@ -877,7 +877,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 3.3
@@ -888,9 +888,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -901,10 +901,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [5] Code Region - G06
@@ -913,7 +913,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 3.3
@@ -924,9 +924,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -937,10 +937,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [6] Code Region - G07
@@ -949,7 +949,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1800
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.54
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 4.3
@@ -960,9 +960,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,3] . D=eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,4] . D=eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -973,10 +973,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [7] Code Region - G08
@@ -985,7 +985,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 5.0
@@ -995,10 +995,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3] . DeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,4] . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1008,11 +1008,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [8] Code Region - G09
@@ -1021,7 +1021,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 5.0
@@ -1031,10 +1031,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1044,11 +1044,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [9] Code Region - G10
@@ -1057,7 +1057,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 608
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.62
# CHECK-NEXT: IPC: 0.82
# CHECK-NEXT: Block RThroughput: 5.7
@@ -1067,10 +1067,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,4] .D===eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,4] . DeeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1080,11 +1080,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [10] Code Region - G11
@@ -1093,7 +1093,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 675
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.70
# CHECK-NEXT: IPC: 0.74
# CHECK-NEXT: Block RThroughput: 6.7
@@ -1103,10 +1103,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,1] .DeeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,2] . DeeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3] . D=eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,4] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1116,11 +1116,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 1 3.0 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 1 1.4 0.4 0.0 <total>
# CHECK: [11] Code Region - G12
@@ -1129,7 +1129,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 675
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.70
# CHECK-NEXT: IPC: 0.74
# CHECK-NEXT: Block RThroughput: 6.7
@@ -1139,10 +1139,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1152,11 +1152,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1 1.4 0.4 0.0 <total>
# CHECK: [12] Code Region - G13
@@ -1165,7 +1165,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 1210
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.90
# CHECK-NEXT: IPC: 0.41
# CHECK-NEXT: Block RThroughput: 5.7
@@ -1175,10 +1175,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789 01
# CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,4] .D==========eeeeeeeeER ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,1] .DeeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,4] . D=======eeeeeeeeER ld1 { v1.b }[0], [x27], #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1188,11 +1188,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 1 4.2 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
# CHECK: [13] Code Region - G14
@@ -1201,10 +1201,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
@@ -1212,9 +1212,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1
# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1225,10 +1225,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1
# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 1 17.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
# CHECK: [14] Code Region - G15
@@ -1237,10 +1237,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
@@ -1248,9 +1248,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28
# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1261,10 +1261,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28
# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 1 17.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
# CHECK: [15] Code Region - G16
@@ -1273,10 +1273,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.25
# CHECK-NEXT: IPC: 0.42
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1284,9 +1284,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.4h }, [x27], #2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1297,10 +1297,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [16] Code Region - G17
@@ -1309,10 +1309,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.94
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1320,9 +1320,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.4s }, [x27], #4
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.1d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1333,10 +1333,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [17] Code Region - G18
@@ -1345,10 +1345,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.94
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1356,9 +1356,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.2d }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1369,10 +1369,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [18] Code Region - G19
@@ -1381,10 +1381,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.73
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.0
+# CHECK-NEXT: Block RThroughput: 3.2
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1392,9 +1392,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.8h }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3] . D=eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1405,10 +1405,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [19] Code Region - G20
@@ -1417,20 +1417,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.71
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1440,11 +1440,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [20] Code Region - G21
@@ -1453,20 +1453,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.31
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.7
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1476,11 +1476,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [21] Code Region - G22
@@ -1489,20 +1489,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 3310
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.63
# CHECK-NEXT: IPC: 0.15
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1512,11 +1512,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 1 16.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [22] Code Region - G23
@@ -1525,20 +1525,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1548,11 +1548,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 1 16.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [23] Code Region - G24
@@ -1561,20 +1561,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 2603
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.77
# CHECK-NEXT: IPC: 0.19
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 012345678
# CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,3] D=================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,4] .D=================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D==============eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1584,11 +1584,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 3. 1 18.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 4. 1 18.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 1 12.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 15.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 1 10.8 0.2 0.0 <total>
# CHECK: [24] Code Region - G25
@@ -1597,20 +1597,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.92
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1620,11 +1620,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [25] Code Region - G26
@@ -1633,20 +1633,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.92
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1656,11 +1656,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [26] Code Region - G27
@@ -1669,20 +1669,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.51
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.8
+# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1692,33 +1692,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [27] Code Region - G28
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.27
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1728,33 +1728,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 1 1.0 0.4 0.0 <total>
# CHECK: [28] Code Region - G29
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 809
# CHECK-NEXT: Total uOps: 3300
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.47
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.08
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0123456
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] . DeeeeeeeeER .. ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER.. ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . .DeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1764,33 +1764,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.6 0.0 <total>
# CHECK: [29] Code Region - G30
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1910
+# CHECK-NEXT: Total Cycles: 1911
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 1.68
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 1.67
# CHECK-NEXT: IPC: 0.26
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D=========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,4] . D================eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . D======eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,4] . .D=============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1800,11 +1800,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 10.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 1 6.4 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 4. 1 14.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 1 4.8 0.4 0.0 <total>
# CHECK: [30] Code Region - G31
@@ -1813,20 +1813,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.75
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1836,11 +1836,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [31] Code Region - G32
@@ -1849,20 +1849,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.75
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1872,33 +1872,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [32] Code Region - G33
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.27
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1908,33 +1908,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 1 1.0 0.4 0.0 <total>
# CHECK: [33] Code Region - G34
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 809
# CHECK-NEXT: Total uOps: 3300
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.47
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.08
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0123456
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1] .DeeeeeeeeER .. ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4] . .DeeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1944,33 +1944,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.6 0.0 <total>
# CHECK: [34] Code Region - G35
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.27
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1980,33 +1980,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.4 0.0 <total>
# CHECK: [35] Code Region - G36
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 710
+# CHECK-NEXT: Total Cycles: 1010
# CHECK-NEXT: Total uOps: 4500
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.34
-# CHECK-NEXT: IPC: 0.70
-# CHECK-NEXT: Block RThroughput: 7.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.46
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,3] . D=eeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,4] . D==eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,4] . . DeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2016,33 +2016,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1 1.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [36] Code Region - G37
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 810
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4900
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.05
-# CHECK-NEXT: IPC: 0.62
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.86
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 8.2
# CHECK: Timeline view:
-# CHECK-NEXT: 01234567
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1] .DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,3] . DeeeeeeeeeER. . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,2] . DeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,3] . .DeeeeeeeeeER. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2052,33 +2052,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 3.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 1 1.6 0.8 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [37] Code Region - G38
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 809
+# CHECK-NEXT: Total Cycles: 1010
# CHECK-NEXT: Total uOps: 4900
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.06
-# CHECK-NEXT: IPC: 0.62
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.85
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 8.2
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3] . DeeeeeeeeeER.. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,4] . D=eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2088,11 +2088,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [38] Code Region - G39
@@ -2101,20 +2101,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,1] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,2] . D============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,3] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,4] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2124,11 +2124,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 2. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 3. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 4. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 1 13.0 0.2 0.0 <total>
# CHECK: [39] Code Region - G40
@@ -2137,20 +2137,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,2] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,3] . .D==================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,4] . . D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2160,11 +2160,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 2. 1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 3. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 4. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 1 13.0 0.2 0.0 <total>
# CHECK: [40] Code Region - G41
@@ -2173,20 +2173,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 4100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.15
# CHECK-NEXT: IPC: 0.26
-# CHECK-NEXT: Block RThroughput: 5.3
+# CHECK-NEXT: Block RThroughput: 6.8
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 01
# CHECK: [0,0] DeeeeeeeeER . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1] D========eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,2] .D========eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3] . D========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,4] . D========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,1] . D======eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,2] . D=====eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3] . .D====eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,4] . . D===eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2196,33 +2196,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 1 7.4 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3. 1 5.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 1 4.6 0.2 0.0 <total>
# CHECK: [41] Code Region - G42
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 659
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4300
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.53
-# CHECK-NEXT: IPC: 0.76
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.26
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 7.2
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,3] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2232,33 +2232,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [42] Code Region - G43
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 610
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.89
-# CHECK-NEXT: IPC: 0.82
-# CHECK-NEXT: Block RThroughput: 5.7
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.16
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3] . D=eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2268,33 +2268,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1 1.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [43] Code Region - G44
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total Cycles: 808
# CHECK-NEXT: Total uOps: 3400
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.69
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.21
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 5.7
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeER. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,2] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3] . D=eeeeeeE-R ldp s1, s2, [x27], #248
-# CHECK-NEXT: [0,4] . D=eeeeeeER ldp d1, d2, [x27], #496
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeeER. ldp s1, s2, [x27], #248
+# CHECK-NEXT: [0,4] . . DeeeeeeER ldp d1, d2, [x27], #496
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2304,11 +2304,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 1.0 ldp s1, s2, [x27], #248
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27], #496
-# CHECK-NEXT: 1 1.4 0.2 0.2 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ldp s1, s2, [x27], #248
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp d1, d2, [x27], #496
+# CHECK-NEXT: 1 1.0 0.8 0.0 <total>
# CHECK: [44] Code Region - G45
@@ -2317,20 +2317,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 507
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.54
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.3
+# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992
-# CHECK-NEXT: [0,1] D=eeeeeeER.. ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,2] D==eeeeeeER. ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,3] .D==eeeeeeER ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,4] .D===eeeeE-R ldp w1, w2, [x27], #248
+# CHECK-NEXT: [0,1] .DeeeeeeER.. ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2] . DeeeeeeER. ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3] . DeeeeeeER ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4] . DeeeeE-R ldp w1, w2, [x27], #248
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2340,11 +2340,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: 4. 1 4.0 0.0 1.0 ldp w1, w2, [x27], #248
-# CHECK-NEXT: 1 2.6 0.2 0.2 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: 4. 1 1.0 0.0 1.0 ldp w1, w2, [x27], #248
+# CHECK-NEXT: 1 1.0 0.2 0.2 <total>
# CHECK: [45] Code Region - G46
@@ -2353,10 +2353,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 507
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.14
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
# CHECK-NEXT: 01
@@ -2364,9 +2364,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER .. ldp x1, x2, [x27], #496
# CHECK-NEXT: [0,1] D=eeeeER .. ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,2] D==eeeeER .. ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,3] D===eeeeeER. ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: [0,4] .D===eeeeeER ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,2] .D=eeeeER .. ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3] . D=eeeeeER. ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: [0,4] . D=eeeeeER ldpsw x1, x2, [x27, #248]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2377,10 +2377,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27, #248]!
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [46] Code Region - G47
@@ -2389,10 +2389,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -2400,9 +2400,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254
# CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27], #254
-# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27], #254
-# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27], #254
-# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27], #254
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ldr s1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ldr d1, [x27], #254
+# CHECK-NEXT: [0,4] . D==eeeeeeER ldr q1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2413,10 +2413,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27], #254
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27], #254
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr s1, [x27], #254
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr d1, [x27], #254
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr q1, [x27], #254
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [47] Code Region - G48
@@ -2425,10 +2425,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -2436,9 +2436,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27, #254]!
-# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27, #254]!
-# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27, #254]!
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ldr s1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ldr d1, [x27, #254]!
+# CHECK-NEXT: [0,4] . D==eeeeeeER ldr q1, [x27, #254]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2449,10 +2449,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27, #254]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27, #254]!
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27, #254]!
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr s1, [x27, #254]!
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr d1, [x27, #254]!
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr q1, [x27, #254]!
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [48] Code Region - G49
@@ -2461,7 +2461,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 1.7
@@ -2473,8 +2473,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldr w1, [x27], #254
# CHECK-NEXT: [0,1] D=eeeeER . ldr x1, [x27], #254
# CHECK-NEXT: [0,2] D==eeeeER . ldr w1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeeeER. ldr x1, [x27, #254]!
-# CHECK-NEXT: [0,4] D====eeeeER ldrb w1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eeeeER. ldr x1, [x27, #254]!
+# CHECK-NEXT: [0,4] .D===eeeeER ldrb w1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2486,9 +2486,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr x1, [x27], #254
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr w1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr x1, [x27, #254]!
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrb w1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr x1, [x27, #254]!
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrb w1, [x27], #254
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [49] Code Region - G50
@@ -2497,7 +2497,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 1.7
@@ -2509,8 +2509,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeER . ldrh w1, [x27], #254
# CHECK-NEXT: [0,2] D==eeeeER . ldrh w1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeeeER. ldrsb w1, [x27], #254
-# CHECK-NEXT: [0,4] D====eeeeER ldrsb x1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eeeeER. ldrsb w1, [x27], #254
+# CHECK-NEXT: [0,4] .D===eeeeER ldrsb x1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2522,9 +2522,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrh w1, [x27], #254
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsb w1, [x27], #254
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsb x1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [50] Code Region - G51
@@ -2533,7 +2533,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 1.7
@@ -2545,8 +2545,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeER . ldrsb x1, [x27, #254]!
# CHECK-NEXT: [0,2] D==eeeeER . ldrsh w1, [x27], #254
-# CHECK-NEXT: [0,3] D===eeeeER. ldrsh x1, [x27], #254
-# CHECK-NEXT: [0,4] D====eeeeER ldrsh w1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D==eeeeER. ldrsh x1, [x27], #254
+# CHECK-NEXT: [0,4] .D===eeeeER ldrsh w1, [x27, #254]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2558,9 +2558,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]!
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsh x1, [x27], #254
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsh w1, [x27, #254]!
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]!
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [51] Code Region - G52
@@ -2569,10 +2569,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.38
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
@@ -2580,8 +2580,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeER. ldrsw x1, [x27], #254
# CHECK-NEXT: [0,2] D==eeeeER ldrsw x1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeE-R st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3] .D==eeE-R st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2d }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2593,9 +2593,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 1.0 st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.2 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 1 2.6 0.2 0.2 <total>
# CHECK: [52] Code Region - G53
@@ -2604,7 +2604,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -2614,9 +2614,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8
# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,4] . D==eeER st1 { v1.8h }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2627,10 +2627,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [53] Code Region - G54
@@ -2639,7 +2639,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -2649,9 +2649,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16
# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeER st1 { v1.4h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2662,10 +2662,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [54] Code Region - G55
@@ -2674,7 +2674,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -2684,9 +2684,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28
# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2697,10 +2697,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [55] Code Region - G56
@@ -2709,7 +2709,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.77
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 3.5
@@ -2718,10 +2718,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,4] .D===eeER st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.8b, v2.8b }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2731,11 +2731,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 1 1.6 0.2 0.0 <total>
# CHECK: [56] Code Region - G57
@@ -2744,7 +2744,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.17
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 4.0
@@ -2753,10 +2753,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeER st1 { v1.2s, v2.2s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2766,11 +2766,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [57] Code Region - G58
@@ -2779,7 +2779,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.17
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 4.0
@@ -2788,10 +2788,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeER st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeER. st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeER st1 { v1.16b, v2.16b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2801,11 +2801,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [58] Code Region - G59
@@ -2814,7 +2814,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.13
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.0
@@ -2823,10 +2823,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2836,32 +2836,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1 1.4 0.4 0.0 <total>
# CHECK: [59] Code Region - G60
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3100
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 4.41
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.40
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.5
# CHECK: Timeline view:
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,3] . DeeER . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2871,11 +2872,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 1.2 0.6 0.0 <total>
# CHECK: [60] Code Region - G61
@@ -2884,7 +2885,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.13
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.0
@@ -2893,10 +2894,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2906,11 +2907,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 1 2.4 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
# CHECK: [61] Code Region - G62
@@ -2919,7 +2920,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.40
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.5
@@ -2929,10 +2930,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3] .D===eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,4] . D====eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,2] . D=eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3] . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2942,33 +2943,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1 3.0 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1 1.4 0.6 0.0 <total>
# CHECK: [62] Code Region - G63
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 804
+# CHECK-NEXT: Total Cycles: 805
# CHECK-NEXT: Total uOps: 3700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.60
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01
+# CHECK-NEXT: 012
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1] D=eeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,2] .D==eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3] . D==eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,4] . D=====eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK: [0,0] DeeER. . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1] . DeeER . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,2] . D=eeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3] . DeeER. . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,4] . . D=eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2978,32 +2979,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.8 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 4. 1 2.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1 1.4 1.0 0.0 <total>
# CHECK: [63] Code Region - G64
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.69
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2] . D=eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . .DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3013,33 +3015,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1 1.4 0.6 0.0 <total>
# CHECK: [64] Code Region - G65
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 706
+# CHECK-NEXT: Total Cycles: 707
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 4.25
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.24
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,2] .D===eeeeER . st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,3] . D===eeeeER. st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,4] . D====eeeeER st1 { v1.b }[0], [x27], x28
+# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,3] . D=eeeeER. st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,4] . .D=eeeeER st1 { v1.b }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3049,11 +3051,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 1 3.0 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 1 1.4 0.8 0.0 <total>
# CHECK: [65] Code Region - G66
@@ -3062,7 +3064,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 5.0
@@ -3072,10 +3074,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeER st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,3] . DeeeeER. st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeER st1 { v1.h }[4], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3085,11 +3087,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [66] Code Region - G67
@@ -3098,7 +3100,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 605
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.80
# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 6.0
@@ -3108,10 +3110,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeeeER . st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,3] . DeeeeER. st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.2d, v2.2d }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3121,11 +3123,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [67] Code Region - G68
@@ -3134,7 +3136,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 705
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.69
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
@@ -3144,10 +3146,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER .. st2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1] D=eeeeER .. st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeER .. st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,3] .D==eeeeER.. st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,4] .D====eeeeER st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeeeER .. st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeER .. st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,3] . DeeeeER.. st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,4] . D=eeeeER st2 { v1.8h, v2.8h }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3157,33 +3159,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1 2.8 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
# CHECK: [68] Code Region - G69
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 805
+# CHECK-NEXT: Total Cycles: 806
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.60
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,2] .D===eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1] . DeeeeER . . st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . D=eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . D==eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeER st2 { v1.4s, v2.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3193,33 +3195,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1 3.6 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 1.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1 2.0 0.8 0.0 <total>
# CHECK: [69] Code Region - G70
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 706
+# CHECK-NEXT: Total Cycles: 707
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.68
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeER . . st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . . st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER. . st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeER . st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,4] . D==eeeeER st2 { v1.b, v2.b }[8], [x27], #2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3229,11 +3231,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 2.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 1 3.2 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 1 1.6 0.8 0.0 <total>
# CHECK: [70] Code Region - G71
@@ -3242,7 +3244,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 5.0
@@ -3252,10 +3254,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER . st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,3] . DeeeeER. st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.h, v2.h }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3265,11 +3267,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [71] Code Region - G72
@@ -3278,7 +3280,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 5.0
@@ -3288,10 +3290,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,2] . DeeeeER . st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeER. st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.d, v2.d }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3301,11 +3303,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [72] Code Region - G73
@@ -3314,7 +3316,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 807
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.72
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 7.0
@@ -3325,9 +3327,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064
# CHECK-NEXT: [0,1] D=eER. . . st2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,2] D==eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3] .D==eeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,4] .D======eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2] .D=eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3] . DeeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,4] . D==eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3338,32 +3340,32 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2g x26, [x27], #4064
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1 3.2 0.8 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1 1.8 0.8 0.0 <total>
# CHECK: [73] Code Region - G74
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1405
+# CHECK-NEXT: Total Cycles: 1406
# CHECK-NEXT: Total uOps: 4700
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 3.35
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 3.34
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 14.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789 0
-# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1] .DeeeeeER . . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,2] . D===eeeeeeER . . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3] . D===eeeeeeER. . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,4] . D=======eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,2] . D==eeeeeeER. . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3] . .D=eeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,4] . . D====eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3373,33 +3375,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 2. 1 4.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 4. 1 8.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 1 3.6 1.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 2. 1 3.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 4. 1 5.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 2.4 1.8 0.0 <total>
# CHECK: [74] Code Region - G75
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1206
+# CHECK-NEXT: Total Cycles: 1207
# CHECK-NEXT: Total uOps: 4100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.40
# CHECK-NEXT: IPC: 0.41
# CHECK-NEXT: Block RThroughput: 12.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234567
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeER . . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2] .D===eeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3] . D===eeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4] . D======eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . . D==eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3409,33 +3411,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 1 3.6 1.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 1.4 1.4 0.0 <total>
# CHECK: [75] Code Region - G76
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1106
+# CHECK-NEXT: Total Cycles: 1107
# CHECK-NEXT: Total uOps: 3800
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 3.44
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 3.43
# CHECK-NEXT: IPC: 0.45
# CHECK-NEXT: Block RThroughput: 11.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeER . .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,2] .D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,3] . D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,4] . D=======eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK: [0,0] DeeeeeeER . . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,2] . D==eeeeeER . . st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,3] . .D=eeeeeER. . st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,4] . . D==eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3445,20 +3447,20 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 2. 1 5.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 4. 1 8.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 1 4.0 1.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 2. 1 3.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 1 2.0 1.4 0.0 <total>
# CHECK: [76] Code Region - G77
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1005
+# CHECK-NEXT: Total Cycles: 1006
# CHECK-NEXT: Total uOps: 3500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.48
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 10.0
@@ -3468,10 +3470,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1] D=eeeeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,2] .D===eeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D====eeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,4] . D======eeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,1] . DeeeeeER. . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,2] . DeeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3481,33 +3483,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 4. 1 7.0 2.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 1 3.8 1.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [77] Code Region - G78
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1304
+# CHECK-NEXT: Total Cycles: 1305
# CHECK-NEXT: Total uOps: 4300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.30
# CHECK-NEXT: IPC: 0.38
# CHECK-NEXT: Block RThroughput: 13.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeER . .. st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeER . .. st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,2] .D===eeeeeER .. st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,3] . D===eeeeeER .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,4] . D=====eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK: [0,0] DeeeeeER . . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,2] . DeeeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,4] . . DeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3517,33 +3519,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1 3.4 1.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [78] Code Region - G79
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 2399
+# CHECK-NEXT: Total Cycles: 2400
# CHECK-NEXT: Total uOps: 6900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.88
# CHECK-NEXT: IPC: 0.21
# CHECK-NEXT: Block RThroughput: 24.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123
-# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1] .DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,2] . D====eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3] . D=========eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,4] . D========eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,2] . D=eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3] . . D=====eeeeeeeER . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,4] . . D====eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3553,33 +3555,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 2. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3. 1 10.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1 5.2 2.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 2. 1 2.0 2.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3. 1 6.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1 3.0 2.0 0.0 <total>
# CHECK: [79] Code Region - G80
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1903
+# CHECK-NEXT: Total Cycles: 1904
# CHECK-NEXT: Total uOps: 5700
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 3.00
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 2.99
# CHECK-NEXT: IPC: 0.26
# CHECK-NEXT: Block RThroughput: 19.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 01
+# CHECK-NEXT: Index 0123456789 012
-# CHECK: [0,0] DeeeeeER . . .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeER. . .. st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,2] . D=====eeeeeeER .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,3] . D=====eeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,4] . D=========eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK: [0,0] DeeeeeER . . . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeER . . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2] . D===eeeeeeER . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . . D==eeeeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . . D====eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3589,33 +3591,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 6.0 5.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 6.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 10.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1 4.8 2.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 4.0 4.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1 2.8 2.0 0.0 <total>
# CHECK: [80] Code Region - G81
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1658
+# CHECK-NEXT: Total Cycles: 1659
# CHECK-NEXT: Total uOps: 4900
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 2.96
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.30
# CHECK-NEXT: Block RThroughput: 16.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,2] . D=========eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,3] . D===========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,4] . D============eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2] . .D======eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,3] . . D========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,4] . . D========eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3626,10 +3628,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 10.0 9.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 3. 1 12.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 1 7.4 2.4 0.0 <total>
+# CHECK-NEXT: 2. 1 7.0 7.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 3. 1 9.0 2.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 1 5.4 2.2 0.0 <total>
# CHECK: [81] Code Region - G82
@@ -3638,7 +3640,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 757
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.30
# CHECK-NEXT: IPC: 0.66
# CHECK-NEXT: Block RThroughput: 7.5
@@ -3648,10 +3650,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,2] D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,3] .D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D=====eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,2] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,3] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3661,11 +3663,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 2. 1 4.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 4. 1 6.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 1 3.4 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 1 1.8 0.6 0.0 <total>
# CHECK: [82] Code Region - G83
@@ -3674,7 +3676,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 2700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.84
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
@@ -3684,10 +3686,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1] D=eeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,2] .D==eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,3] .D===eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,4] . D===eE--R stg x26, [x27], #4064
+# CHECK-NEXT: [0,1] .DeeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,2] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,3] . DeeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,4] . DeE--R stg x26, [x27], #4064
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3697,11 +3699,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 2.0 stg x26, [x27], #4064
-# CHECK-NEXT: 1 2.8 0.4 0.4 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 2.0 stg x26, [x27], #4064
+# CHECK-NEXT: 1 1.2 0.4 0.4 <total>
# CHECK: [83] Code Region - G84
@@ -3710,19 +3712,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.37
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 2.8
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . stg x26, [x27, #4064]!
# CHECK-NEXT: [0,1] D=eER. . stgp x1, x2, [x27], #992
-# CHECK-NEXT: [0,2] D==eER . stgp x1, x2, [x27, #992]!
-# CHECK-NEXT: [0,3] D===eeER. stp s1, s2, [x27], #248
-# CHECK-NEXT: [0,4] .D===eeER stp d1, d2, [x27], #496
+# CHECK-NEXT: [0,2] .D=eER . stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,3] . D=eeER. stp s1, s2, [x27], #248
+# CHECK-NEXT: [0,4] . D=eeER stp d1, d2, [x27], #496
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3733,10 +3735,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stg x26, [x27, #4064]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stgp x1, x2, [x27], #992
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stgp x1, x2, [x27, #992]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp s1, s2, [x27], #248
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 stp d1, d2, [x27], #496
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 stp s1, s2, [x27], #248
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp d1, d2, [x27], #496
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [84] Code Region - G85
@@ -3745,19 +3747,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.84
# CHECK-NEXT: IPC: 0.71
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992
-# CHECK-NEXT: [0,1] D==eeER . stp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,2] D===eeER . stp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,3] .D===eeER. stp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,4] .D=====eER stp w1, w2, [x27], #248
+# CHECK-NEXT: [0,1] .D=eeER . stp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2] . D=eeER . stp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3] . D=eeER. stp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4] . D==eER stp w1, w2, [x27], #248
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3767,11 +3769,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 stp s1, s2, [x27, #248]!
-# CHECK-NEXT: 2. 1 4.0 0.0 0.0 stp d1, d2, [x27, #496]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp q1, q2, [x27, #992]!
-# CHECK-NEXT: 4. 1 6.0 0.0 0.0 stp w1, w2, [x27], #248
-# CHECK-NEXT: 1 3.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp s1, s2, [x27, #248]!
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp d1, d2, [x27, #496]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 stp q1, q2, [x27, #992]!
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stp w1, w2, [x27], #248
+# CHECK-NEXT: 1 2.0 0.2 0.0 <total>
# CHECK: [85] Code Region - G86
@@ -3780,19 +3782,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.37
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 2.8
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . stp x1, x2, [x27], #496
# CHECK-NEXT: [0,1] D=eER. . stp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,2] D==eER . stp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,3] D===eeER. str b1, [x27], #254
-# CHECK-NEXT: [0,4] .D===eeER str h1, [x27], #254
+# CHECK-NEXT: [0,2] .D=eER . stp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3] . D=eeER. str b1, [x27], #254
+# CHECK-NEXT: [0,4] . D=eeER str h1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3803,10 +3805,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stp x1, x2, [x27, #496]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27], #254
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27], #254
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp x1, x2, [x27, #496]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 str b1, [x27], #254
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str h1, [x27], #254
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [86] Code Region - G87
@@ -3815,19 +3817,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . str s1, [x27], #254
-# CHECK-NEXT: [0,1] D=eeER . str d1, [x27], #254
-# CHECK-NEXT: [0,2] D==eeER . str q1, [x27], #254
-# CHECK-NEXT: [0,3] D===eeER. str b1, [x27, #254]!
-# CHECK-NEXT: [0,4] .D===eeER str h1, [x27, #254]!
+# CHECK-NEXT: [0,1] .DeeER . str d1, [x27], #254
+# CHECK-NEXT: [0,2] . DeeER . str q1, [x27], #254
+# CHECK-NEXT: [0,3] . DeeER. str b1, [x27, #254]!
+# CHECK-NEXT: [0,4] . DeeER str h1, [x27, #254]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3837,11 +3839,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27], #254
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27, #254]!
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27, #254]!
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 str d1, [x27], #254
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str q1, [x27], #254
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 str b1, [x27, #254]!
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str h1, [x27, #254]!
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [87] Code Region - G88
@@ -3850,19 +3852,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 1800
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.58
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 01234567
# CHECK: [0,0] DeeER. . str s1, [x27, #254]!
-# CHECK-NEXT: [0,1] D=eeER . str d1, [x27, #254]!
-# CHECK-NEXT: [0,2] D==eeER. str q1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eER. str w1, [x27], #254
-# CHECK-NEXT: [0,4] .D===eER str x1, [x27], #254
+# CHECK-NEXT: [0,1] .DeeER . str d1, [x27, #254]!
+# CHECK-NEXT: [0,2] . DeeER. str q1, [x27, #254]!
+# CHECK-NEXT: [0,3] . DeER. str w1, [x27], #254
+# CHECK-NEXT: [0,4] . D=eER str x1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3872,11 +3874,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]!
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27, #254]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str w1, [x27], #254
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str x1, [x27], #254
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 str d1, [x27, #254]!
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str q1, [x27, #254]!
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 str w1, [x27], #254
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str x1, [x27], #254
+# CHECK-NEXT: 1 1.2 0.2 0.0 <total>
# CHECK: [88] Code Region - G89
@@ -3885,7 +3887,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -3895,9 +3897,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeER . . str w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . str x1, [x27, #254]!
-# CHECK-NEXT: [0,2] D==eER . strb w1, [x27], #254
-# CHECK-NEXT: [0,3] D===eER. strb w1, [x27, #254]!
-# CHECK-NEXT: [0,4] D====eER strh w1, [x27], #254
+# CHECK-NEXT: [0,2] .D=eER . strb w1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eER. strb w1, [x27, #254]!
+# CHECK-NEXT: [0,4] . D==eER strh w1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3908,10 +3910,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str x1, [x27, #254]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 strb w1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 strb w1, [x27, #254]!
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 strh w1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 strb w1, [x27], #254
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 strb w1, [x27, #254]!
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 strh w1, [x27], #254
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [89] Code Region - G90
@@ -3920,7 +3922,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -3930,9 +3932,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeER . . strh w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . stz2g x26, [x27], #4064
-# CHECK-NEXT: [0,2] D==eER . stz2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,3] D===eER. stzg x26, [x27], #4064
-# CHECK-NEXT: [0,4] D====eER stzg x26, [x27, #4064]!
+# CHECK-NEXT: [0,2] .D=eER . stz2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,3] .D==eER. stzg x26, [x27], #4064
+# CHECK-NEXT: [0,4] . D==eER stzg x26, [x27, #4064]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3943,10 +3945,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stz2g x26, [x27], #4064
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stz2g x26, [x27, #4064]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stzg x26, [x27], #4064
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 stzg x26, [x27, #4064]!
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stz2g x26, [x27, #4064]!
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 stzg x26, [x27], #4064
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stzg x26, [x27, #4064]!
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [90] Code Region - G91
@@ -3955,7 +3957,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 110
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.64
# CHECK-NEXT: IPC: 1.82
# CHECK-NEXT: Block RThroughput: 0.7
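
[editor's note, not part of the patch] A quick key for reading the timeline hunks above, following the llvm-mca documentation (each column is one cycle, left to right):

    D : instruction dispatched
    = : already dispatched, waiting to be executed
    e : instruction executing
    E : instruction finished executing
    - : executed, waiting to be retired
    R : instruction retired

This legend is why the updated expectations look the way they do: with the dispatch width lowered from 16 to 6, the store groups can no longer all dispatch in the first cycle or two, so the old post-dispatch stalls (runs of `=`) largely turn into later `D` positions in the new timelines.
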
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
index 1690d96..3ddb525 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
@@ -23,18 +23,18 @@ mov x1, x2
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 0 0.06 mov x1, #0
-# CHECK-NEXT: 1 0 0.06 mov x1, xzr
-# CHECK-NEXT: 1 0 0.06 mov w1, #0
-# CHECK-NEXT: 1 0 0.06 mov w1, wzr
-# CHECK-NEXT: 1 0 0.06 fmov h1, wzr
-# CHECK-NEXT: 1 0 0.06 fmov h1, xzr
-# CHECK-NEXT: 1 0 0.06 fmov s1, wzr
-# CHECK-NEXT: 1 0 0.06 fmov d1, xzr
-# CHECK-NEXT: 1 0 0.06 movi d1, #0000000000000000
-# CHECK-NEXT: 1 0 0.06 movi v1.2d, #0000000000000000
-# CHECK-NEXT: 1 0 0.06 mov w1, w2
-# CHECK-NEXT: 1 0 0.06 mov x1, x2
+# CHECK-NEXT: 1 0 0.17 mov x1, #0
+# CHECK-NEXT: 1 0 0.17 mov x1, xzr
+# CHECK-NEXT: 1 0 0.17 mov w1, #0
+# CHECK-NEXT: 1 0 0.17 mov w1, wzr
+# CHECK-NEXT: 1 0 0.17 fmov h1, wzr
+# CHECK-NEXT: 1 0 0.17 fmov h1, xzr
+# CHECK-NEXT: 1 0 0.17 fmov s1, wzr
+# CHECK-NEXT: 1 0 0.17 fmov d1, xzr
+# CHECK-NEXT: 1 0 0.17 movi d1, #0000000000000000
+# CHECK-NEXT: 1 0 0.17 movi v1.2d, #0000000000000000
+# CHECK-NEXT: 1 0 0.17 mov w1, w2
+# CHECK-NEXT: 1 0 0.17 mov x1, x2
# CHECK: Resources:
# CHECK-NEXT: [0.0] - V2UnitB
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index 081bcb6..f42a08e 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -66,7 +66,6 @@ static cl::opt<std::string> ClDataLayout("data-layout",
cl::desc("data layout string to use"),
cl::value_desc("layout-string"),
cl::init(""), cl::cat(AsCat));
-extern cl::opt<bool> UseNewDbgInfoFormat;
static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) {
// Infer the output filename if needed.
@@ -140,10 +139,8 @@ int main(int argc, char **argv) {
return 1;
}
- // Convert to new debug format if requested.
- M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
- if (M->IsNewDbgInfoFormat)
- M->removeDebugIntrinsicDeclarations();
+ M->setIsNewDbgInfoFormat(true);
+ M->removeDebugIntrinsicDeclarations();
std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
index 765f8bb..df8eb1d 100644
--- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
+++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
@@ -186,8 +186,8 @@ void SourceCoverageViewText::renderLine(raw_ostream &OS, LineRef L,
if (getOptions().Debug && Highlight)
HighlightedRanges.push_back(std::make_pair(Col, End));
Col = End;
- if ((!S->IsGapRegion || (Highlight && *Highlight == raw_ostream::RED)) &&
- S->HasCount && S->Count == 0)
+ if ((!S->IsGapRegion || Highlight == raw_ostream::RED) && S->HasCount &&
+ S->Count == 0)
Highlight = raw_ostream::RED;
else if (Col == ExpansionCol)
Highlight = raw_ostream::CYAN;
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 8c1aaf6..8937272 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -96,8 +96,6 @@ static cl::opt<bool> PrintThinLTOIndexOnly(
cl::desc("Only read thinlto index and print the index as LLVM assembly."),
cl::init(false), cl::Hidden, cl::cat(DisCategory));
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
namespace {
static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) {
@@ -270,9 +268,8 @@ int main(int argc, char **argv) {
// All that llvm-dis does is write the assembly to a file.
if (!DontPrint) {
if (M) {
- M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
- if (UseNewDbgInfoFormat)
- M->removeDebugIntrinsicDeclarations();
+ M->setIsNewDbgInfoFormat(true);
+ M->removeDebugIntrinsicDeclarations();
M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
}
if (Index)
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index ac5b5f5..35b4f0a 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -129,8 +129,6 @@ static cl::opt<bool> IgnoreNonBitcode(
cl::desc("Do not report an error for non-bitcode files in archives"),
cl::Hidden);
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
static ExitOnError ExitOnErr;
// Read the specified bitcode file in and return it. This routine searches the
@@ -531,10 +529,10 @@ int main(int argc, char **argv) {
Composite->removeDebugIntrinsicDeclarations();
};
if (OutputAssembly) {
- SetFormat(UseNewDbgInfoFormat);
+ SetFormat(true);
Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
} else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
- SetFormat(UseNewDbgInfoFormat);
+ SetFormat(true);
WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
}
diff --git a/llvm/tools/llvm-objdump/SourcePrinter.cpp b/llvm/tools/llvm-objdump/SourcePrinter.cpp
index 33d494b..bf33637 100644
--- a/llvm/tools/llvm-objdump/SourcePrinter.cpp
+++ b/llvm/tools/llvm-objdump/SourcePrinter.cpp
@@ -45,7 +45,7 @@ void LiveVariable::print(raw_ostream &OS, const MCRegisterInfo &MRI) const {
return {};
};
- Expression.printCompact(OS, GetRegName);
+ DWARFExpressionPrinter::printCompact(&Expression, OS, GetRegName);
}
void LiveVariablePrinter::addVariable(DWARFDie FuncDie, DWARFDie VarDie) {
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp
index 64bf711..5d958dd 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp
@@ -13,18 +13,26 @@ using namespace llvm;
static bool shouldReduceSection(GlobalObject &GO) { return GO.hasSection(); }
-static bool shouldReduceAlign(GlobalObject &GO) {
- return GO.getAlign().has_value();
+static bool shouldReduceAlign(GlobalVariable *GV) {
+ return GV->getAlign().has_value();
}
+static bool shouldReduceAlign(Function *F) { return F->getAlign().has_value(); }
+
static bool shouldReduceComdat(GlobalObject &GO) { return GO.hasComdat(); }
void llvm::reduceGlobalObjectsDeltaPass(Oracle &O, ReducerWorkItem &Program) {
for (auto &GO : Program.getModule().global_objects()) {
if (shouldReduceSection(GO) && !O.shouldKeep())
GO.setSection("");
- if (shouldReduceAlign(GO) && !O.shouldKeep())
- GO.setAlignment(MaybeAlign());
+ if (auto *GV = dyn_cast<GlobalVariable>(&GO)) {
+ if (shouldReduceAlign(GV) && !O.shouldKeep())
+ GV->setAlignment(MaybeAlign());
+ }
+ if (auto *F = dyn_cast<Function>(&GO)) {
+ if (shouldReduceAlign(F) && !O.shouldKeep())
+ F->setAlignment(MaybeAlign());
+ }
if (shouldReduceComdat(GO) && !O.shouldKeep())
GO.setComdat(nullptr);
}
diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp
index 0101be4..286cfa7 100644
--- a/llvm/unittests/ADT/STLExtrasTest.cpp
+++ b/llvm/unittests/ADT/STLExtrasTest.cpp
@@ -1567,6 +1567,30 @@ TEST(STLExtrasTest, Mismatch) {
}
}
+TEST(STLExtrasTest, Includes) {
+ {
+ std::vector<int> V1 = {1, 2};
+ std::vector<int> V2;
+ EXPECT_TRUE(includes(V1, V2));
+ EXPECT_FALSE(includes(V2, V1));
+ V2.push_back(1);
+ EXPECT_TRUE(includes(V1, V2));
+ V2.push_back(3);
+ EXPECT_FALSE(includes(V1, V2));
+ }
+
+ {
+ std::vector<int> V1 = {3, 2, 1};
+ std::vector<int> V2;
+ EXPECT_TRUE(includes(V1, V2, std::greater<>()));
+ EXPECT_FALSE(includes(V2, V1, std::greater<>()));
+ V2.push_back(3);
+ EXPECT_TRUE(includes(V1, V2, std::greater<>()));
+ V2.push_back(0);
+ EXPECT_FALSE(includes(V1, V2, std::greater<>()));
+ }
+}
+
struct Foo;
struct Bar {};
diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
index 9edb21f..03009d5 100644
--- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
+++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
@@ -24,9 +24,6 @@
using namespace llvm;
using namespace IRSimilarity;
-LLVM_ABI extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
static std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context,
StringRef ModuleStr) {
SMDiagnostic Err;
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index e23005b..6031898 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -187,6 +187,15 @@ TEST_F(MatchSelectPatternTest, FastFMin) {
" %A = select i1 %1, float %a, float 5.0\n"
" ret float %A\n"
"}\n");
+ expectPattern({SPF_FMINNUM, SPNB_RETURNS_ANY, true});
+}
+
+TEST_F(MatchSelectPatternTest, FastFMinUnordered) {
+ parseAssembly("define float @test(float %a) {\n"
+ " %1 = fcmp nnan ult float %a, 5.0\n"
+ " %A = select i1 %1, float %a, float 5.0\n"
+ " ret float %A\n"
+ "}\n");
expectPattern({SPF_FMINNUM, SPNB_RETURNS_ANY, false});
}
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 67a0d94..1b590aa 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -567,7 +567,7 @@ struct VPMatchContext : public SDPatternMatch::BasicMatchContext {
return OpVal->getOpcode() == Opc;
auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), false);
- return BaseOpc.has_value() && *BaseOpc == Opc;
+ return BaseOpc == Opc;
}
unsigned getNumOperands(SDValue N) const {
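
The simplification above (and the identical one in ARMAttributeParser.cpp further down) relies on the mixed comparison operators of `std::optional`: a disengaged optional compares unequal to any value. A minimal standalone sketch of the semantics:

```c++
#include <cassert>
#include <optional>

int main() {
  std::optional<unsigned> Opt;
  // `Opt == V` is exactly `Opt.has_value() && *Opt == V`.
  assert(!(Opt == 5u)); // disengaged: never equal
  Opt = 5;
  assert(Opt == 5u);    // engaged: compares the contained value
  return 0;
}
```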
diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp
index 62f48b6..9225ab0 100644
--- a/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp
+++ b/llvm/unittests/DebugInfo/DWARF/DWARFExpressionCompactPrinterTest.cpp
@@ -70,7 +70,7 @@ void DWARFExpressionCompactPrinterTest::TestExprPrinter(
return {};
};
- Expr.printCompact(OS, GetRegName);
+ DWARFExpressionPrinter::printCompact(&Expr, OS, GetRegName);
EXPECT_EQ(OS.str(), Expected);
}
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index c3a0f66..a888fd6 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -27,8 +27,6 @@
using namespace llvm;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
SMDiagnostic Err;
std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
@@ -241,8 +239,6 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
// Duplicate of above test, but in DbgVariableRecord representation.
TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
LLVMContext C;
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = true;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
@@ -285,23 +281,19 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
EXPECT_EQ(DVRs[1]->getNumVariableLocationOps(), 2u);
EXPECT_TRUE(isa<UndefValue>(DVRs[1]->getVariableLocationOp(1)));
EXPECT_TRUE(DVRs[1]->isKillLocation());
- UseNewDbgInfoFormat = OldDbgValueMode;
}
// Ensure that the order of dbg.value intrinsics returned by findDbgValues, and
// their corresponding DbgVariableRecord representation, are consistent.
TEST(MetadataTest, OrderingOfDbgVariableRecords) {
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = false;
LLVMContext C;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
%b = add i16 %a, 1, !dbg !11
- call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
- call void @llvm.dbg.value(metadata i16 %b, metadata !12, metadata !DIExpression()), !dbg !11
+ #dbg_value(i16 %b, !9, !DIExpression(), !11)
+ #dbg_value(i16 %b, !12, !DIExpression(), !11)
ret i16 0, !dbg !11
}
- declare void @llvm.dbg.value(metadata, metadata, metadata) #0
attributes #0 = { nounwind readnone speculatable willreturn }
!llvm.dbg.cu = !{!0}
@@ -324,33 +316,20 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) {
SmallVector<DbgValueInst *, 2> DVIs;
SmallVector<DbgVariableRecord *, 2> DVRs;
- findDbgValues(DVIs, &I, &DVRs);
- ASSERT_EQ(DVIs.size(), 2u);
- ASSERT_EQ(DVRs.size(), 0u);
-
- // The correct order of dbg.values is given by their use-list, which becomes
- // the reverse order of creation. Thus the dbg.values should come out as
- // "bar" and then "foo".
- DILocalVariable *Var0 = DVIs[0]->getVariable();
- EXPECT_TRUE(Var0->getName() == "bar");
- DILocalVariable *Var1 = DVIs[1]->getVariable();
- EXPECT_TRUE(Var1->getName() == "foo");
- // Now try again, but in DbgVariableRecord form.
- DVIs.clear();
-
- M->convertToNewDbgValues();
findDbgValues(DVIs, &I, &DVRs);
ASSERT_EQ(DVIs.size(), 0u);
ASSERT_EQ(DVRs.size(), 2u);
- Var0 = DVRs[0]->getVariable();
+ // The correct order of dbg.values is given by their use-list, which becomes
+ // the reverse order of creation. Thus the dbg.values should come out as
+ // "bar" and then "foo".
+ const DILocalVariable *Var0 = DVRs[0]->getVariable();
EXPECT_TRUE(Var0->getName() == "bar");
- Var1 = DVRs[1]->getVariable();
+ const DILocalVariable *Var1 = DVRs[1]->getVariable();
EXPECT_TRUE(Var1->getName() == "foo");
M->convertFromNewDbgValues();
- UseNewDbgInfoFormat = OldDbgValueMode;
}
TEST(DIBuilder, CreateFile) {
@@ -923,8 +902,6 @@ TEST(AssignmentTrackingTest, InstrMethods) {
// dbg.values that have been converted to a non-instruction format.
TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
LLVMContext C;
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = false;
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -954,6 +931,12 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
!11 = !DILocation(line: 1, column: 1, scope: !6)
)");
+  // The IR above will be autoupgraded to debug records, but this test is about
+  // the conversion routines, so convert it back. The test remains valuable
+  // going forwards for exercising the conversion routines, which some
+  // compatibility tools (DXIL?) wish to use.
+ M->convertFromNewDbgValues();
+
// Find the first dbg.value,
Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt();
const DILocalVariable *Var = nullptr;
@@ -1099,15 +1082,11 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
// The record of those trailing DbgVariableRecords would dangle and cause an
// assertion failure if it lived until the end of the LLVMContext.
ExitBlock->deleteTrailingDbgRecords();
- UseNewDbgInfoFormat = OldDbgValueMode;
}
TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
LLVMContext C;
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = false;
-
std::unique_ptr<Module> M = parseIR(C, R"(
define i16 @f(i16 %a) !dbg !6 {
call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -1137,13 +1116,13 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
!11 = !DILocation(line: 1, column: 1, scope: !6)
)");
- // For the purpose of this test, set and un-set the command line option
- // corresponding to UseNewDbgInfoFormat, but only after parsing, to ensure
- // that the IR starts off in the old format.
- UseNewDbgInfoFormat = true;
+  // This test exists to check that we can convert back and forth between old
+  // and new debug info formats (dbg.value intrinsics versus #dbg_value
+  // records). We're ripping out support for debug intrinsics, but the
+  // conversions will live on in bitcode autoupgrade and possibly DXIL
+  // autodowngrade, so this test is still valuable. Begin in the intrinsic
+  // format:
+ M->convertFromNewDbgValues();
- // Check that the conversion routines and utilities between dbg.value
- // debug-info format and DbgVariableRecords works.
Function *F = M->getFunction("f");
BasicBlock *BB1 = &F->getEntryBlock();
// First instruction should be a dbg.value.
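
As a minimal sketch of the round trip these tests now exercise (assuming a module `M` parsed from textual IR, which the parser autoupgrades to record form; both conversion routines appear elsewhere in this patch):

```c++
#include "llvm/IR/Module.h"

// Round-trip between the two debug-info representations. Textual IR is
// autoupgraded to #dbg_value records on parse, so conversion starts from
// the record form.
void roundTrip(llvm::Module &M) {
  M.convertFromNewDbgValues(); // records -> dbg.value intrinsics
  // ... code under test that expects intrinsics ...
  M.convertToNewDbgValues();   // intrinsics -> records
}
```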
@@ -1245,8 +1224,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
EXPECT_EQ(DVI1->getExpression(), Expr1);
EXPECT_EQ(DVI2->getVariable(), DLV2);
EXPECT_EQ(DVI2->getExpression(), Expr2);
-
- UseNewDbgInfoFormat = OldDbgValueMode;
}
// Test that the hashing function for DISubprograms representing methods produce
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index acf1692..126db4d 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -33,8 +33,6 @@
#include "gtest/gtest.h"
#include <memory>
-LLVM_ABI extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
namespace llvm {
namespace {
@@ -1452,8 +1450,6 @@ TEST(InstructionsTest, GetSplat) {
TEST(InstructionsTest, SkipDebug) {
LLVMContext C;
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = false;
std::unique_ptr<Module> M = parseIR(C,
R"(
declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -1480,6 +1476,8 @@ TEST(InstructionsTest, SkipDebug) {
)");
ASSERT_TRUE(M);
Function *F = cast<Function>(M->getNamedValue("f"));
+ // This test wants to see dbg.values.
+ F->convertFromNewDbgValues();
BasicBlock &BB = F->front();
// The first non-debug instruction is the terminator.
@@ -1489,7 +1487,6 @@ TEST(InstructionsTest, SkipDebug) {
// After the terminator, there are no non-debug instructions.
EXPECT_EQ(nullptr, Term->getNextNonDebugInstruction());
- UseNewDbgInfoFormat = OldDbgValueMode;
}
TEST(InstructionsTest, PhiMightNotBeFPMathOperator) {
diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp
index 888595e..4d28fe0 100644
--- a/llvm/unittests/IR/ValueTest.cpp
+++ b/llvm/unittests/IR/ValueTest.cpp
@@ -20,8 +20,6 @@
#include "gtest/gtest.h"
using namespace llvm;
-LLVM_ABI extern cl::opt<bool> UseNewDbgInfoFormat;
-
namespace {
TEST(ValueTest, UsedInBasicBlock) {
@@ -257,74 +255,6 @@ TEST(ValueTest, getLocalSlotDeath) {
TEST(ValueTest, replaceUsesOutsideBlock) {
// Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
- // BB, including dbg.* uses of MetadataAsValue(ValueAsMetadata(this)).
- bool OldDbgValueMode = UseNewDbgInfoFormat;
- UseNewDbgInfoFormat = false;
- const auto *IR = R"(
- define i32 @f() !dbg !6 {
- entry:
- %a = add i32 0, 1, !dbg !15
- %b = add i32 0, 2, !dbg !15
- %c = add i32 %a, 2, !dbg !15
- call void @llvm.dbg.value(metadata i32 %a, metadata !9, metadata !DIExpression()), !dbg !15
- br label %exit, !dbg !15
-
- exit:
- call void @llvm.dbg.value(metadata i32 %a, metadata !11, metadata !DIExpression()), !dbg !16
- ret i32 %a, !dbg !16
- }
-
- declare void @llvm.dbg.value(metadata, metadata, metadata)
-
- !llvm.dbg.cu = !{!0}
- !llvm.module.flags = !{!5}
-
- !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
- !1 = !DIFile(filename: "test.ll", directory: "/")
- !2 = !{}
- !5 = !{i32 2, !"Debug Info Version", i32 3}
- !6 = distinct !DISubprogram(name: "f", linkageName: "f", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
- !7 = !DISubroutineType(types: !2)
- !8 = !{!9, !11}
- !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
- !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_signed)
- !11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 2, type: !12)
- !12 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_signed)
- !15 = !DILocation(line: 1, column: 1, scope: !6)
- !16 = !DILocation(line: 5, column: 1, scope: !6)
- )";
- LLVMContext Ctx;
- SMDiagnostic Err;
- std::unique_ptr<Module> M = parseAssemblyString(IR, Err, Ctx);
- if (!M)
- Err.print("ValueTest", errs());
-
- auto GetNext = [](auto *I) { return &*++I->getIterator(); };
-
- Function *F = M->getFunction("f");
- // Entry.
- BasicBlock *Entry = &F->front();
- Instruction *A = &Entry->front();
- Instruction *B = GetNext(A);
- Instruction *C = GetNext(B);
- auto *EntryDbg = cast<DbgValueInst>(GetNext(C));
- // Exit.
- BasicBlock *Exit = GetNext(Entry);
- auto *ExitDbg = cast<DbgValueInst>(&Exit->front());
- Instruction *Ret = GetNext(ExitDbg);
-
- A->replaceUsesOutsideBlock(B, Entry);
- // These users are in Entry so shouldn't be changed.
- ASSERT_TRUE(C->getOperand(0) == cast<Value>(A));
- ASSERT_TRUE(EntryDbg->getValue(0) == cast<Value>(A));
- // These users are outside Entry so should be changed.
- ASSERT_TRUE(ExitDbg->getValue(0) == cast<Value>(B));
- ASSERT_TRUE(Ret->getOperand(0) == cast<Value>(B));
- UseNewDbgInfoFormat = OldDbgValueMode;
-}
-
-TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
- // Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
// BB, including DbgVariableRecords.
const auto *IR = R"(
define i32 @f() !dbg !6 {
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index 18882ad..33928ac 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -1051,29 +1051,6 @@ define void @foo() {
auto *Call = cast<sandboxir::CallInst>(&*It++);
// Check classof(), creation.
auto *GO = cast<sandboxir::GlobalObject>(Call->getCalledOperand());
- // Check getAlignment().
- EXPECT_EQ(GO->getAlignment(), LLVMGO->getAlignment());
- // Check getAlign().
- EXPECT_EQ(GO->getAlign(), LLVMGO->getAlign());
- // Check setAlignment().
- auto OrigMaybeAlign = GO->getAlign();
- auto NewMaybeAlign = MaybeAlign(128);
- EXPECT_NE(NewMaybeAlign, OrigMaybeAlign);
- GO->setAlignment(NewMaybeAlign);
- EXPECT_EQ(GO->getAlign(), NewMaybeAlign);
- GO->setAlignment(OrigMaybeAlign);
- EXPECT_EQ(GO->getAlign(), OrigMaybeAlign);
- // Check getGlobalObjectSubClassData().
- EXPECT_EQ(GO->getGlobalObjectSubClassData(),
- LLVMGO->getGlobalObjectSubClassData());
- // Check setGlobalObjectSubClassData().
- auto OrigGOSCD = GO->getGlobalObjectSubClassData();
- auto NewGOSCD = 1u;
- EXPECT_NE(NewGOSCD, OrigGOSCD);
- GO->setGlobalObjectSubClassData(NewGOSCD);
- EXPECT_EQ(GO->getGlobalObjectSubClassData(), NewGOSCD);
- GO->setGlobalObjectSubClassData(OrigGOSCD);
- EXPECT_EQ(GO->getGlobalObjectSubClassData(), OrigGOSCD);
// Check hasSection().
EXPECT_EQ(GO->hasSection(), LLVMGO->hasSection());
// Check getSection().
@@ -1284,6 +1261,16 @@ define void @foo() {
EXPECT_EQ(GV0->getCodeModelRaw(), LLVMGV0->getCodeModelRaw());
// Check getCodeModel().
EXPECT_EQ(GV0->getCodeModel(), LLVMGV0->getCodeModel());
+ // Check getAlign().
+ EXPECT_EQ(GV0->getAlign(), LLVMGV0->getAlign());
+ // Check setAlignment().
+ auto OrigMaybeAlign = GV0->getAlign();
+ auto NewMaybeAlign = MaybeAlign(128);
+ EXPECT_NE(NewMaybeAlign, OrigMaybeAlign);
+ GV0->setAlignment(NewMaybeAlign);
+ EXPECT_EQ(GV0->getAlign(), NewMaybeAlign);
+ GV0->setAlignment(OrigMaybeAlign);
+ EXPECT_EQ(GV0->getAlign(), OrigMaybeAlign);
}
TEST_F(SandboxIRTest, GlobalAlias) {
@@ -1855,6 +1842,17 @@ bb1:
)IR");
}
#endif // NDEBUG
+
+ // Check getAlign().
+ EXPECT_EQ(F0->getAlign(), F0->getAlign());
+ // Check setAlignment().
+ auto OrigMaybeAlign = F0->getAlign();
+ auto NewMaybeAlign = MaybeAlign(128);
+ EXPECT_NE(NewMaybeAlign, OrigMaybeAlign);
+ F0->setAlignment(NewMaybeAlign);
+ EXPECT_EQ(F0->getAlign(), NewMaybeAlign);
+ F0->setAlignment(OrigMaybeAlign);
+ EXPECT_EQ(F0->getAlign(), OrigMaybeAlign);
}
TEST_F(SandboxIRTest, Module) {
diff --git a/llvm/unittests/Support/ARMAttributeParser.cpp b/llvm/unittests/Support/ARMAttributeParser.cpp
index e06f7fa..305f4b3 100644
--- a/llvm/unittests/Support/ARMAttributeParser.cpp
+++ b/llvm/unittests/Support/ARMAttributeParser.cpp
@@ -38,7 +38,7 @@ bool testBuildAttr(unsigned Tag, unsigned Value,
cantFail(Parser.parse(Bytes, llvm::endianness::little));
std::optional<unsigned> Attr = Parser.getAttributeValue("", ExpectedTag);
- return Attr && *Attr == ExpectedValue;
+ return Attr == ExpectedValue;
}
void testParseError(ArrayRef<uint8_t> bytes, const char *msg) {
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index a87aa8b..d33c0db 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -899,6 +899,11 @@ TreePredicateFn::TreePredicateFn(TreePattern *N) : PatFragRec(N) {
assert(
(!hasPredCode() || !hasImmCode()) &&
".td file corrupt: can't have a node predicate *and* an imm predicate");
+
+ if (hasGISelPredicateCode() && hasGISelLeafPredicateCode())
+ PrintFatalError(getOrigPatFragRecord()->getRecord()->getLoc(),
+ ".td file corrupt: can't have GISelPredicateCode *and* "
+ "GISelLeafPredicateCode");
}
bool TreePredicateFn::hasPredCode() const {
@@ -1293,8 +1298,20 @@ bool TreePredicateFn::hasGISelPredicateCode() const {
}
std::string TreePredicateFn::getGISelPredicateCode() const {
- return std::string(
- PatFragRec->getRecord()->getValueAsString("GISelPredicateCode"));
+ return PatFragRec->getRecord()->getValueAsString("GISelPredicateCode").str();
+}
+
+bool TreePredicateFn::hasGISelLeafPredicateCode() const {
+ return PatFragRec->getRecord()
+ ->getValueAsOptionalString("GISelLeafPredicateCode")
+ .has_value();
+}
+
+std::string TreePredicateFn::getGISelLeafPredicateCode() const {
+ return PatFragRec->getRecord()
+ ->getValueAsOptionalString("GISelLeafPredicateCode")
+ .value_or(StringRef())
+ .str();
}
StringRef TreePredicateFn::getImmType() const {
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 725414f..a5aadf2 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -590,6 +590,11 @@ public:
bool hasGISelPredicateCode() const;
std::string getGISelPredicateCode() const;
+ // If true, indicates that GlobalISel-based C++ code was supplied for checking
+ // register operands.
+ bool hasGISelLeafPredicateCode() const;
+ std::string getGISelLeafPredicateCode() const;
+
private:
bool hasPredCode() const;
bool hasImmCode() const;
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 8132f99..5ec9b35 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -2623,6 +2623,55 @@ CodeGenRegBank::getMinimalPhysRegClass(const Record *RegRecord,
return BestRC;
}
+const CodeGenRegisterClass *
+CodeGenRegBank::getSuperRegForSubReg(const ValueTypeByHwMode &ValueTy,
+ const CodeGenSubRegIndex *SubIdx,
+ bool MustBeAllocatable) const {
+ std::vector<const CodeGenRegisterClass *> Candidates;
+ auto &RegClasses = getRegClasses();
+
+ // Try to find a register class which supports ValueTy, and also contains
+ // SubIdx.
+ for (const CodeGenRegisterClass &RC : RegClasses) {
+ // Is there a subclass of this class which contains this subregister index?
+ const CodeGenRegisterClass *SubClassWithSubReg =
+ RC.getSubClassWithSubReg(SubIdx);
+ if (!SubClassWithSubReg)
+ continue;
+
+ // We have a class. Check if it supports this value type.
+ if (!llvm::is_contained(SubClassWithSubReg->VTs, ValueTy))
+ continue;
+
+ // If necessary, check that it is allocatable.
+ if (MustBeAllocatable && !SubClassWithSubReg->Allocatable)
+ continue;
+
+ // We have a register class which supports both the value type and
+ // subregister index. Remember it.
+ Candidates.push_back(SubClassWithSubReg);
+ }
+
+ // If we didn't find anything, we're done.
+ if (Candidates.empty())
+ return nullptr;
+
+ // Find and return the largest of our candidate classes.
+ llvm::stable_sort(Candidates, [&](const CodeGenRegisterClass *A,
+ const CodeGenRegisterClass *B) {
+ if (A->getMembers().size() > B->getMembers().size())
+ return true;
+
+ if (A->getMembers().size() < B->getMembers().size())
+ return false;
+
+ // Order by name as a tie-breaker.
+ return StringRef(A->getName()) < B->getName();
+ });
+
+ return Candidates[0];
+}
+
BitVector
CodeGenRegBank::computeCoveredRegisters(ArrayRef<const Record *> Regs) {
SetVector<const CodeGenRegister *> Set;
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h
index 75d9a3f7..3f4c157 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h
@@ -831,6 +831,13 @@ public:
getMinimalPhysRegClass(const Record *RegRecord,
ValueTypeByHwMode *VT = nullptr);
+ /// Return the largest register class which supports \p Ty and covers \p
+ /// SubIdx if it exists.
+ const CodeGenRegisterClass *
+ getSuperRegForSubReg(const ValueTypeByHwMode &Ty,
+ const CodeGenSubRegIndex *SubIdx,
+ bool MustBeAllocatable = false) const;
+
// Get the sum of unit weights.
unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const {
unsigned Weight = 0;
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
index 303589d..f519582 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
@@ -160,54 +160,6 @@ CodeGenRegBank &CodeGenTarget::getRegBank() const {
return *RegBank;
}
-const CodeGenRegisterClass *CodeGenTarget::getSuperRegForSubReg(
- const ValueTypeByHwMode &ValueTy, CodeGenRegBank &RegBank,
- const CodeGenSubRegIndex *SubIdx, bool MustBeAllocatable) const {
- std::vector<const CodeGenRegisterClass *> Candidates;
- auto &RegClasses = RegBank.getRegClasses();
-
- // Try to find a register class which supports ValueTy, and also contains
- // SubIdx.
- for (const CodeGenRegisterClass &RC : RegClasses) {
- // Is there a subclass of this class which contains this subregister index?
- const CodeGenRegisterClass *SubClassWithSubReg =
- RC.getSubClassWithSubReg(SubIdx);
- if (!SubClassWithSubReg)
- continue;
-
- // We have a class. Check if it supports this value type.
- if (!llvm::is_contained(SubClassWithSubReg->VTs, ValueTy))
- continue;
-
- // If necessary, check that it is allocatable.
- if (MustBeAllocatable && !SubClassWithSubReg->Allocatable)
- continue;
-
- // We have a register class which supports both the value type and
- // subregister index. Remember it.
- Candidates.push_back(SubClassWithSubReg);
- }
-
- // If we didn't find anything, we're done.
- if (Candidates.empty())
- return nullptr;
-
- // Find and return the largest of our candidate classes.
- llvm::stable_sort(Candidates, [&](const CodeGenRegisterClass *A,
- const CodeGenRegisterClass *B) {
- if (A->getMembers().size() > B->getMembers().size())
- return true;
-
- if (A->getMembers().size() < B->getMembers().size())
- return false;
-
- // Order by name as a tie-breaker.
- return StringRef(A->getName()) < B->getName();
- });
-
- return Candidates[0];
-}
-
/// getRegisterByName - If there is a register with the specific AsmName,
/// return it.
const CodeGenRegister *CodeGenTarget::getRegisterByName(StringRef Name) const {
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.h b/llvm/utils/TableGen/Common/CodeGenTarget.h
index da2f3e0..52871f3 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.h
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.h
@@ -122,13 +122,6 @@ public:
/// getRegBank - Return the register bank description.
CodeGenRegBank &getRegBank() const;
- /// Return the largest register class on \p RegBank which supports \p Ty and
- /// covers \p SubIdx if it exists.
- const CodeGenRegisterClass *
- getSuperRegForSubReg(const ValueTypeByHwMode &Ty, CodeGenRegBank &RegBank,
- const CodeGenSubRegIndex *SubIdx,
- bool MustBeAllocatable = false) const;
-
/// getRegisterByName - If there is a register with the specific AsmName,
/// return it.
const CodeGenRegister *getRegisterByName(StringRef Name) const;
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 2cb3579..327ac5f 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -33,6 +33,8 @@ Error failUnsupported(const Twine &Reason) {
std::string getEnumNameForPredicate(const TreePredicateFn &Predicate) {
if (Predicate.hasGISelPredicateCode())
return "GICXXPred_MI_" + Predicate.getFnName();
+ if (Predicate.hasGISelLeafPredicateCode())
+ return "GICXXPred_MO_" + Predicate.getFnName();
return "GICXXPred_" + Predicate.getImmTypeIdentifier().str() + "_" +
Predicate.getFnName();
}
@@ -1326,6 +1328,19 @@ void OperandImmPredicateMatcher::emitPredicateOpcodes(MatchTable &Table,
<< MatchTable::LineBreak;
}
+//===- OperandLeafPredicateMatcher -------------------------------------===//
+
+void OperandLeafPredicateMatcher::emitPredicateOpcodes(
+ MatchTable &Table, RuleMatcher &Rule) const {
+ Table << MatchTable::Opcode("GIM_CheckLeafOperandPredicate")
+ << MatchTable::Comment("MI") << MatchTable::ULEB128Value(InsnVarID)
+ << MatchTable::Comment("MO") << MatchTable::ULEB128Value(OpIdx)
+ << MatchTable::Comment("Predicate")
+ << MatchTable::NamedValue(2, getEnumNameForPredicate(Predicate))
+ << MatchTable::LineBreak;
+}
+
//===- OperandMatcher -----------------------------------------------------===//
std::string OperandMatcher::getOperandExpr(unsigned InsnVarID) const {
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index fd24459..6647257 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -824,6 +824,7 @@ public:
IPM_OneUse,
IPM_GenericPredicate,
IPM_MIFlags,
+ OPM_LeafPredicate,
OPM_SameOperand,
OPM_ComplexPattern,
OPM_IntrinsicID,
@@ -1255,6 +1256,26 @@ public:
RuleMatcher &Rule) const override;
};
+/// Generates code to check that this operand is a register whose value meets
+/// the predicate.
+class OperandLeafPredicateMatcher : public OperandPredicateMatcher {
+protected:
+ TreePredicateFn Predicate;
+
+public:
+ OperandLeafPredicateMatcher(unsigned InsnVarID, unsigned OpIdx,
+ const TreePredicateFn &Predicate)
+ : OperandPredicateMatcher(OPM_LeafPredicate, InsnVarID, OpIdx),
+ Predicate(Predicate) {}
+
+ static bool classof(const PredicateMatcher *P) {
+ return P->getKind() == OPM_LeafPredicate;
+ }
+
+ void emitPredicateOpcodes(MatchTable &Table,
+ RuleMatcher &Rule) const override;
+};
+
/// Generates code to check that a set of predicates match for a particular
/// operand.
class OperandMatcher : public PredicateListMatcher<OperandPredicateMatcher> {
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp
index ffab2fd..333d956 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp
@@ -182,6 +182,7 @@ void GlobalISelMatchTableExecutorEmitter::emitExecutorImpl(
emitSubtargetFeatureBitsetImpl(OS, Rules);
emitComplexPredicates(OS, ComplexOperandMatchers);
emitMIPredicateFns(OS);
+ emitLeafPredicateFns(OS);
emitI64ImmPredicateFns(OS);
emitAPFloatImmPredicateFns(OS);
emitAPIntImmPredicateFns(OS);
@@ -234,6 +235,9 @@ void GlobalISelMatchTableExecutorEmitter::emitTemporariesDecl(
<< " bool testMIPredicate_MI(unsigned PredicateID, const MachineInstr &MI"
", const MatcherState &State) "
"const override;\n"
+ << " bool testMOPredicate_MO(unsigned PredicateID, const MachineOperand "
+ "&MO, const MatcherState &State) "
+ "const override;\n"
<< " bool testSimplePredicate(unsigned PredicateID) const override;\n"
<< " bool runCustomAction(unsigned FnID, const MatcherState &State, "
"NewMIVector &OutMIs) "
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h
index 862f1e8..1f66d73 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h
@@ -79,8 +79,8 @@ class GlobalISelMatchTableExecutorEmitter {
raw_ostream &OS, StringRef TypeIdentifier, StringRef ArgType,
StringRef ArgName, StringRef AdditionalArgs,
StringRef AdditionalDeclarations, ArrayRef<PredicateObject> Predicates,
- std::function<StringRef(PredicateObject)> GetPredEnumName,
- std::function<StringRef(PredicateObject)> GetPredCode,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredEnumName,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredCode,
StringRef Comment) {
if (!Comment.empty())
OS << "// " << Comment << "\n";
@@ -135,14 +135,34 @@ protected:
void emitMIPredicateFnsImpl(
raw_ostream &OS, StringRef AdditionalDecls,
ArrayRef<PredicateObject> Predicates,
- std::function<StringRef(PredicateObject)> GetPredEnumName,
- std::function<StringRef(PredicateObject)> GetPredCode,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredEnumName,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredCode,
StringRef Comment = "") {
return emitCxxPredicateFns(
OS, "MI", "const MachineInstr &", "MI", ", const MatcherState &State",
AdditionalDecls, Predicates, GetPredEnumName, GetPredCode, Comment);
}
+ /// Emits `testMOPredicate_MO`.
+ /// \tparam PredicateObject An object representing a predicate to emit.
+ /// \param OS Output stream.
+ /// \param AdditionalDecls Additional C++ variable declarations.
+ /// \param Predicates Predicates to emit.
+ /// \param GetPredEnumName Returns an enum name for a given predicate.
+ /// \param GetPredCode Returns the C++ code of a given predicate.
+ /// \param Comment Optional comment for the enum declaration.
+ template <typename PredicateObject>
+ void emitLeafPredicateFnsImpl(
+ raw_ostream &OS, StringRef AdditionalDecls,
+ ArrayRef<PredicateObject> Predicates,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredEnumName,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredCode,
+ StringRef Comment = "") {
+ return emitCxxPredicateFns(
+ OS, "MO", "const MachineOperand &", "MO", ", const MatcherState &State",
+ AdditionalDecls, Predicates, GetPredEnumName, GetPredCode, Comment);
+ }
+
/// Helper function to emit the following executor functions:
/// * testImmPredicate_I64 (TypeIdentifier=I64)
/// * testImmPredicate_APInt (TypeIdentifier=APInt)
@@ -160,8 +180,8 @@ protected:
void emitImmPredicateFnsImpl(
raw_ostream &OS, StringRef TypeIdentifier, StringRef ArgType,
ArrayRef<PredicateObject> Predicates,
- std::function<StringRef(PredicateObject)> GetPredEnumName,
- std::function<StringRef(PredicateObject)> GetPredCode,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredEnumName,
+ llvm::function_ref<StringRef(PredicateObject)> GetPredCode,
StringRef Comment = "") {
return emitCxxPredicateFns(OS, TypeIdentifier, ArgType, "Imm", "", "",
Predicates, GetPredEnumName, GetPredCode,
@@ -189,6 +209,10 @@ public:
/// Note: `emitMIPredicateFnsImpl` can be used to do most of the work.
virtual void emitMIPredicateFns(raw_ostream &OS) = 0;
+  /// Emit the `testMOPredicate_MO` function.
+  /// Note: `emitLeafPredicateFnsImpl` can be used to do most of the work.
+ virtual void emitLeafPredicateFns(raw_ostream &OS) = 0;
+
/// Emit the `testImmPredicate_I64` function.
/// Note: `emitImmPredicateFnsImpl` can be used to do most of the work.
virtual void emitI64ImmPredicateFns(raw_ostream &OS) = 0;
diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index afaf050..f62b265 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -2414,6 +2414,7 @@ class GICombinerEmitter final : public GlobalISelMatchTableExecutorEmitter {
void emitAdditionalImpl(raw_ostream &OS) override;
void emitMIPredicateFns(raw_ostream &OS) override;
+ void emitLeafPredicateFns(raw_ostream &OS) override;
void emitI64ImmPredicateFns(raw_ostream &OS) override;
void emitAPFloatImmPredicateFns(raw_ostream &OS) override;
void emitAPIntImmPredicateFns(raw_ostream &OS) override;
@@ -2581,6 +2582,12 @@ void GICombinerEmitter::emitMIPredicateFns(raw_ostream &OS) {
[](const CXXPredicateCode *C) -> StringRef { return C->Code; });
}
+void GICombinerEmitter::emitLeafPredicateFns(raw_ostream &OS) {
+ // Unused, but still needs to be called.
+ emitLeafPredicateFnsImpl<unsigned>(
+ OS, "", {}, [](unsigned) { return ""; }, [](unsigned) { return ""; });
+}
+
void GICombinerEmitter::emitI64ImmPredicateFns(raw_ostream &OS) {
// Unused, but still needs to be called.
emitImmPredicateFnsImpl<unsigned>(
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 55f60db..6772043 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -321,6 +321,7 @@ public:
void emitAdditionalImpl(raw_ostream &OS) override;
void emitMIPredicateFns(raw_ostream &OS) override;
+ void emitLeafPredicateFns(raw_ostream &OS) override;
void emitI64ImmPredicateFns(raw_ostream &OS) override;
void emitAPFloatImmPredicateFns(raw_ostream &OS) override;
void emitAPIntImmPredicateFns(raw_ostream &OS) override;
@@ -1110,8 +1111,16 @@ Error GlobalISelEmitter::importChildMatcher(
return Error::success();
}
- if (SrcChild.hasAnyPredicate())
- return failedImport("Src pattern child has unsupported predicate");
+ if (SrcChild.hasAnyPredicate()) {
+ for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) {
+ const TreePredicateFn &Predicate = Call.Fn;
+
+ if (!Predicate.hasGISelLeafPredicateCode())
+ return failedImport("Src pattern child has unsupported predicate");
+ OM.addPredicate<OperandLeafPredicateMatcher>(Predicate);
+ }
+ return Error::success();
+ }
// Check for constant immediates.
if (auto *ChildInt = dyn_cast<IntInit>(SrcChild.getLeafValue())) {
@@ -2010,7 +2019,7 @@ const CodeGenRegisterClass *GlobalISelEmitter::inferSuperRegisterClass(
// Use the information we found above to find a minimal register class which
// supports the subregister and type we want.
- return Target.getSuperRegForSubReg(Ty.getValueTypeByHwMode(), CGRegs, SubIdx,
+ return CGRegs.getSuperRegForSubReg(Ty.getValueTypeByHwMode(), SubIdx,
/*MustBeAllocatable=*/true);
}
@@ -2293,6 +2302,27 @@ void GlobalISelEmitter::emitMIPredicateFns(raw_ostream &OS) {
"PatFrag predicates.");
}
+void GlobalISelEmitter::emitLeafPredicateFns(raw_ostream &OS) {
+ std::vector<const Record *> MatchedRecords;
+ llvm::copy_if(AllPatFrags, std::back_inserter(MatchedRecords),
+ [](const Record *R) {
+ return (!R->getValueAsOptionalString("GISelLeafPredicateCode")
+ .value_or(std::string())
+ .empty());
+ });
+ emitLeafPredicateFnsImpl<const Record *>(
+ OS,
+ " const auto &Operands = State.RecordedOperands;\n"
+ " Register Reg = MO.getReg();\n"
+ " (void)Operands;\n"
+ " (void)Reg;",
+ ArrayRef<const Record *>(MatchedRecords), &getPatFragPredicateEnumName,
+ [](const Record *R) {
+ return R->getValueAsString("GISelLeafPredicateCode");
+ },
+ "PatFrag predicates.");
+}
+
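
For orientation, the dispatcher this emits plausibly takes the following shape for a target with a single `GISelLeafPredicateCode` PatFrag. This is an illustrative sketch only: the selector class name, the enum value, and the predicate body are hypothetical (the real enum names come from `getEnumNameForPredicate`), while the signature and the `Operands`/`Reg` preamble match the declarations added in this patch.

```c++
// Hypothetical emitted code; names are illustrative.
bool MyTargetInstructionSelector::testMOPredicate_MO(
    unsigned PredicateID, const MachineOperand &MO,
    const MatcherState &State) const {
  const auto &Operands = State.RecordedOperands;
  Register Reg = MO.getReg();
  (void)Operands;
  (void)Reg;
  switch (PredicateID) {
  case GICXXPred_MO_Predicate_myLeafPred:
    // Body pasted verbatim from the PatFrag's GISelLeafPredicateCode.
    return Reg.isVirtual();
  }
  llvm_unreachable("Unknown predicate");
}
```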
void GlobalISelEmitter::emitI64ImmPredicateFns(raw_ostream &OS) {
std::vector<const Record *> MatchedRecords;
llvm::copy_if(AllPatFrags, std::back_inserter(MatchedRecords),
diff --git a/mlir/include/mlir/Debug/BreakpointManagers/TagBreakpointManager.h b/mlir/include/mlir/Debug/BreakpointManagers/TagBreakpointManager.h
index 85fdb9a..af138da 100644
--- a/mlir/include/mlir/Debug/BreakpointManagers/TagBreakpointManager.h
+++ b/mlir/include/mlir/Debug/BreakpointManagers/TagBreakpointManager.h
@@ -48,7 +48,7 @@ public:
/// If a breakpoint already exists for the given tag, return the existing
/// instance.
TagBreakpoint *addBreakpoint(StringRef tag) {
- auto result = breakpoints.insert({tag, nullptr});
+ auto result = breakpoints.try_emplace(tag);
auto &it = result.first;
if (result.second)
it->second = std::make_unique<TagBreakpoint>(tag.str());
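
This `try_emplace` change (repeated in IRNumbering.cpp below) avoids materializing a `{key, nullptr}` pair when the key may already be present. A standalone sketch of the idiom using `std::map`, which has the same C++17 interface:

```c++
#include <map>
#include <memory>
#include <string>

int main() {
  std::map<std::string, std::unique_ptr<int>> cache;
  // try_emplace takes the key alone and value-initializes the mapped
  // type (here, a null unique_ptr) only if the key was absent.
  auto [it, inserted] = cache.try_emplace("answer");
  if (inserted)
    it->second = std::make_unique<int>(42);
  return 0;
}
```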
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 599b3b9..adc27ae 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -1216,6 +1216,58 @@ def Arith_ExtFOp : Arith_FToFCastOp<"extf", [DeclareOpInterfaceMethods<ArithFast
}
//===----------------------------------------------------------------------===//
+// Scaling ExtFOp
+//===----------------------------------------------------------------------===//
+def Arith_ScalingExtFOp
+ : Arith_Op<
+ "scaling_extf", [Pure, SameInputOutputTensorDims,
+ DeclareOpInterfaceMethods<ArithFastMathInterface>,
+ DeclareOpInterfaceMethods<CastOpInterface>]>,
+ Arguments<(ins FloatLike:$in, FloatLike:$scale,
+ OptionalAttr<Arith_FastMathAttr>:$fastmath)>,
+ Results<(outs FloatLike:$out)> {
+ let summary = "Upcasts input floats using provided scales values following "
+ "OCP MXFP Spec";
+ let description = [{
+ This operation upcasts input floating-point values using provided scale
+ values. It expects both scales and the input operand to be of the same shape,
+ making the operation elementwise. Scales are usually calculated per block
+ following the OCP MXFP spec as described in https://arxiv.org/abs/2310.10537.
+
+ If scales are calculated per block where blockSize != 1, then scales may
+ require broadcasting to make this operation elementwise. For example, let's
+ say the input is of shape `<dim1 x dim2 x ... dimN>`. Given blockSize != 1 and
+ assuming quantization happens on the last axis, the input can be reshaped to
+ `<dim1 x dim2 x ... (dimN/blockSize) x blockSize>`. Scales will be calculated
+ per block on the last axis. Therefore, scales will be of shape
+ `<dim1 x dim2 x ... (dimN/blockSize) x 1>`. Scales could also be of some other
+ shape as long as it is broadcast compatible with the input, e.g.,
+ `<1 x 1 x ... (dimN/blockSize) x 1>`.
+
+ In this example, before calling into `arith.scaling_extf`, scales must be
+ broadcasted to `<dim1 x dim2 x dim3 ... (dimN/blockSize) x blockSize>`. Note
+ that there could be multiple quantization axes. Internally,
+ `arith.scaling_extf` would perform the following:
+
+ ```
+ resultTy = get_type(result)
+ scaleTy = get_type(scale)
+ inputTy = get_type(input)
+ scale.exponent = arith.truncf(scale) : scaleTy to f8E8M0
+ scale.extf = arith.extf(scale.exponent) : f8E8M0 to resultTy
+ input.extf = arith.extf(input) : inputTy to resultTy
+ result = arith.mulf(scale.extf, input.extf)
+ ```
+ It propagates NaN values. Therefore, if either scale or the input element
+ contains NaN, then the output element value will also be a NaN.
+ }];
+ let hasVerifier = 1;
+ let assemblyFormat =
+ [{ $in `,` $scale (`fastmath` `` $fastmath^)? attr-dict `:`
+ type($in) `,` type($scale) `to` type($out)}];
+}
+
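The internal computation spelled out in the description maps directly onto builder calls. A hedged sketch in the style of the ExpandOps patterns further down in this patch; the helper name and the f8E8M0 type plumbing are illustrative, not part of the patch:

```c++
// Sketch of the expansion described above; assumes the caller supplies a
// cloned-to-shape f8E8M0 type, as the F8E8M0 converters below do.
Value expandScalingExtF(ImplicitLocOpBuilder &b, Type resultTy, Type f8E8M0Ty,
                        Value input, Value scale) {
  Value scaleExp = b.create<arith::TruncFOp>(f8E8M0Ty, scale);  // scale.exponent
  Value scaleExt = b.create<arith::ExtFOp>(resultTy, scaleExp); // scale.extf
  Value inputExt = b.create<arith::ExtFOp>(resultTy, input);    // input.extf
  return b.create<arith::MulFOp>(scaleExt, inputExt);           // result
}
```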
+//===----------------------------------------------------------------------===//
// TruncIOp
//===----------------------------------------------------------------------===//
@@ -1281,6 +1333,63 @@ def Arith_TruncFOp :
}
//===----------------------------------------------------------------------===//
+// Scaling TruncFOp
+//===----------------------------------------------------------------------===//
+
+def Arith_ScalingTruncFOp
+ : Arith_Op<"scaling_truncf",
+ [Pure, SameInputOutputTensorDims,
+ DeclareOpInterfaceMethods<ArithRoundingModeInterface>,
+ DeclareOpInterfaceMethods<ArithFastMathInterface>,
+ DeclareOpInterfaceMethods<CastOpInterface>]>,
+ Arguments<(ins FloatLike:$in, FloatLike:$scale,
+ OptionalAttr<Arith_RoundingModeAttr>:$roundingmode,
+ OptionalAttr<Arith_FastMathAttr>:$fastmath)>,
+ Results<(outs FloatLike:$out)> {
+ let summary = "Downcasts input floating point values using provided scales "
+ "values following OCP MXFP Spec";
+ let description = [{
+ This operation downcasts input using the provided scale values. It expects
+ both scales and the input operand to be of the same shape and, therefore,
+ makes the operation elementwise. Scales are usually calculated per block
+ following the OCP MXFP spec as described in https://arxiv.org/abs/2310.10537.
+    Users are required to normalize and clamp the scales as necessary before
+    passing them to this operation. The OCP MXFP spec also flushes denorms on
+    the input operand; this should be handled during lowering by passing the
+    appropriate fastMath flag to this operation.
+
+ If scales are calculated per block where blockSize != 1, scales may require
+ broadcasting to make this operation elementwise. For example, let's say the
+ input is of shape `<dim1 x dim2 x ... dimN>`. Given blockSize != 1 and
+ assuming quantization happens on the last axis, the input can be reshaped to
+ `<dim1 x dim2 x ... (dimN/blockSize) x blockSize>`. Scales will be calculated
+ per block on the last axis. Therefore, scales will be of shape
+ `<dim1 x dim2 x ... (dimN/blockSize) x 1>`. Scales could also be of some other
+ shape as long as it is broadcast compatible with the input, e.g.,
+ `<1 x 1 x ... (dimN/blockSize) x 1>`.
+
+ In this example, before calling into `arith.scaling_truncf`, scales must be
+ broadcasted to `<dim1 x dim2 x dim3 ... (dimN/blockSize) x blockSize>`. Note
+ that there could be multiple quantization axes. Internally,
+ `arith.scaling_truncf` would perform the following:
+
+ ```
+ scaleTy = get_type(scale)
+ inputTy = get_type(input)
+ resultTy = get_type(result)
+ scale.exponent = arith.truncf(scale) : scaleTy to f8E8M0
+ scale.extf = arith.extf(scale.exponent) : f8E8M0 to inputTy
+ result = arith.divf(input, scale.extf)
+ result.cast = arith.truncf(result, resultTy)
+ ```
+ }];
+ let hasVerifier = 1;
+ let assemblyFormat =
+ [{ $in `,` $scale ($roundingmode^)? (`fastmath` `` $fastmath^)? attr-dict `:`
+ type($in) `,` type($scale) `to` type($out)}];
+}
+
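A companion sketch for the truncation direction, under the same assumptions as the extension sketch above (illustrative helper, f8E8M0 type supplied by the caller):

```c++
// Sketch of the scaling_truncf expansion described above.
Value expandScalingTruncF(ImplicitLocOpBuilder &b, Type resultTy,
                          Type f8E8M0Ty, Value input, Value scale) {
  Value scaleExp = b.create<arith::TruncFOp>(f8E8M0Ty, scale);        // scale.exponent
  Value scaleExt = b.create<arith::ExtFOp>(input.getType(), scaleExp); // scale.extf
  Value result = b.create<arith::DivFOp>(input, scaleExt);             // result
  return b.create<arith::TruncFOp>(resultTy, result);                  // result.cast
}
```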
+//===----------------------------------------------------------------------===//
// UIToFPOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
index 5aaac8d..e0a4567 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -62,6 +62,9 @@ void populateExpandBFloat16Patterns(RewritePatternSet &patterns);
/// Add patterns to expand Arith f8e8m0 patterns to lower level bitcasts/shifts.
void populateExpandF8E8M0Patterns(RewritePatternSet &patterns);
+/// Add patterns to expand scaling ExtF/TruncF ops to equivalent arith ops
+void populateExpandScalingExtTruncPatterns(RewritePatternSet &patterns);
+
/// Add patterns to expand Arith ops.
void populateArithExpandOpsPatterns(RewritePatternSet &patterns);
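
A usage sketch for the new populate hook, assuming these helpers live in the usual `mlir::arith` namespace (the enclosing function is illustrative):

```c++
// Minimal sketch: composing the new expansion patterns into a pass's
// pattern set alongside the existing entry point.
void buildExpansionPatterns(mlir::RewritePatternSet &patterns) {
  mlir::arith::populateExpandScalingExtTruncPatterns(patterns);
  mlir::arith::populateArithExpandOpsPatterns(patterns);
}
```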
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td
index 22d5afc..309079e 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td
@@ -23,7 +23,7 @@ class SPIRV_ArithmeticBinaryOp<string mnemonic, Type type,
// Operands type same as result type.
SPIRV_BinaryOp<mnemonic, type, type,
!listconcat(traits,
- [Pure, SameOperandsAndResultType])> {
+ [Pure, AllTypesMatch<["operand1", "operand2", "result"]>])> {
// In addition to normal types arithmetic instructions can support cooperative
// matrix.
let arguments = (ins
@@ -42,7 +42,7 @@ class SPIRV_ArithmeticUnaryOp<string mnemonic, Type type,
// Operand type same as result type.
SPIRV_UnaryOp<mnemonic, type, type,
!listconcat(traits,
- [Pure, SameOperandsAndResultType])> {
+ [Pure, AllTypesMatch<["operand", "result"]>])> {
// In addition to normal types arithmetic instructions can support cooperative
// matrix.
let arguments = (ins
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index 057dfac..feae817 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -767,6 +767,77 @@ def SPIRV_GLTanhOp : SPIRV_GLUnaryArithmeticOp<"Tanh", 21, SPIRV_Float16or32> {
// -----
+def SPIRV_GLAsinhOp : SPIRV_GLUnaryArithmeticOp<"Asinh", 22, SPIRV_Float16or32> {
+ let summary = "Arc hyperbolic sine of operand in radians.";
+
+ let description = [{
+ Arc hyperbolic sine; result is the inverse of sinh.
+
+ The operand x must be a scalar or vector whose component type is 16-bit or
+ 32-bit floating-point.
+
+ Result Type and the type of x must be the same type. Results are computed
+ per component.
+
+ #### Example:
+
+ ```mlir
+ %2 = spirv.GL.Asinh %0 : f32
+ %3 = spirv.GL.Asinh %1 : vector<3xf16>
+ ```
+ }];
+}
+
+// -----
+
+def SPIRV_GLAcoshOp : SPIRV_GLUnaryArithmeticOp<"Acosh", 23, SPIRV_Float16or32> {
+ let summary = "Arc hyperbolic cosine of operand in radians.";
+
+ let description = [{
+ Arc hyperbolic cosine; result is the non-negative inverse of cosh. The resulting
+ value is NaN if x < 1.
+
+ The operand x must be a scalar or vector whose component type is 16-bit or
+ 32-bit floating-point.
+
+ Result Type and the type of x must be the same type. Results are computed
+ per component.
+
+ #### Example:
+
+ ```mlir
+ %2 = spirv.GL.Acosh %0 : f32
+ %3 = spirv.GL.Acosh %1 : vector<3xf16>
+ ```
+ }];
+}
+
+// -----
+
+def SPIRV_GLAtanhOp : SPIRV_GLUnaryArithmeticOp<"Atanh", 24, SPIRV_Float16or32> {
+ let summary = "Arc hyperbolic tangent of operand in radians.";
+
+ let description = [{
+ Arc hyperbolic tangent; result is the inverse of tanh. The resulting value
+ is NaN if abs x ≥ 1.
+
+ The operand x must be a scalar or vector whose component type is 16-bit or
+ 32-bit floating-point.
+
+ Result Type and the type of x must be the same type. Results are computed
+ per component.
+
+ #### Example:
+
+ ```mlir
+ %2 = spirv.GL.Atanh %0 : f32
+ %3 = spirv.GL.Atanh %1 : vector<3xf16>
+ ```
+ }];
+}
+
+// -----
+
def SPIRV_GLFClampOp : SPIRV_GLTernaryArithmeticOp<"FClamp", 43, SPIRV_Float> {
let summary = "Clamp x between min and max values.";
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h
index 2e29e9af..787535d 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h
@@ -394,7 +394,8 @@ hash_value(const StructType::MemberDecorationInfo &memberDecorationInfo);
// SPIR-V KHR cooperative matrix type
class CooperativeMatrixType
: public Type::TypeBase<CooperativeMatrixType, CompositeType,
- detail::CooperativeMatrixTypeStorage> {
+ detail::CooperativeMatrixTypeStorage,
+ ShapedType::Trait> {
public:
using Base::Base;
@@ -418,6 +419,22 @@ public:
std::optional<StorageClass> storage = std::nullopt);
void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities,
std::optional<StorageClass> storage = std::nullopt);
+
+ operator ShapedType() const { return llvm::cast<ShapedType>(*this); }
+
+ ArrayRef<int64_t> getShape() const;
+
+ bool hasRank() const { return true; }
+
+ CooperativeMatrixType cloneWith(std::optional<ArrayRef<int64_t>> shape,
+ Type elementType) const {
+ if (!shape)
+ return get(elementType, getRows(), getColumns(), getScope(), getUse());
+
+ assert(shape.value().size() == 2);
+ return get(elementType, shape.value()[0], shape.value()[1], getScope(),
+ getUse());
+ }
};
// SPIR-V matrix type
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 3f7b326..d68dbdb 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -60,6 +60,7 @@ public:
Attribute metadata = Attribute());
// Types.
+ FloatType getF8E8M0Type();
FloatType getBF16Type();
FloatType getF16Type();
FloatType getTF32Type();
diff --git a/mlir/include/mlir/Tools/lsp-server-support/Protocol.h b/mlir/include/mlir/Tools/lsp-server-support/Protocol.h
index 5d2eb01..cc06dbf 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/Protocol.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/Protocol.h
@@ -158,6 +158,12 @@ struct ClientCapabilities {
/// Client supports CodeAction return value for textDocument/codeAction.
/// textDocument.codeAction.codeActionLiteralSupport.
bool codeActionStructure = false;
+
+ /// Client supports server-initiated progress via the
+ /// window/workDoneProgress/create method.
+ ///
+ /// window.workDoneProgress
+ bool workDoneProgress = false;
};
/// Add support for JSON serialization.
diff --git a/mlir/lib/Bytecode/Writer/IRNumbering.cpp b/mlir/lib/Bytecode/Writer/IRNumbering.cpp
index 1bc02e1..8e8e714 100644
--- a/mlir/lib/Bytecode/Writer/IRNumbering.cpp
+++ b/mlir/lib/Bytecode/Writer/IRNumbering.cpp
@@ -308,7 +308,7 @@ void IRNumberingState::computeGlobalNumberingState(Operation *rootOp) {
}
void IRNumberingState::number(Attribute attr) {
- auto it = attrs.insert({attr, nullptr});
+ auto it = attrs.try_emplace(attr);
if (!it.second) {
++it.first->second->refCount;
return;
@@ -475,7 +475,7 @@ void IRNumberingState::number(OperationName opName) {
}
void IRNumberingState::number(Type type) {
- auto it = types.insert({type, nullptr});
+ auto it = types.try_emplace(type);
if (!it.second) {
++it.first->second->refCount;
return;
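
This and the analogous `insert({key, nullptr})` to `try_emplace(key)` changes below are behavior-preserving: `try_emplace` value-initializes the mapped value only on insertion and avoids spelling out the null placeholder. A standalone illustration (hypothetical cache, not from the patch):

```cpp
#include "llvm/ADT/DenseMap.h"

int *lookupOrCreate(llvm::DenseMap<int, int *> &cache, int key) {
  // Mapped pointer is value-initialized (nullptr) only when key is new.
  auto [it, inserted] = cache.try_emplace(key);
  if (inserted)
    it->second = new int(0); // first sighting: materialize the entry
  return it->second;
}
```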
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 41f2d0f..9e53e19 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1452,6 +1452,19 @@ bool arith::ExtFOp::areCastCompatible(TypeRange inputs, TypeRange outputs) {
LogicalResult arith::ExtFOp::verify() { return verifyExtOp<FloatType>(*this); }
//===----------------------------------------------------------------------===//
+// ScalingExtFOp
+//===----------------------------------------------------------------------===//
+
+bool arith::ScalingExtFOp::areCastCompatible(TypeRange inputs,
+ TypeRange outputs) {
+ return checkWidthChangeCast<std::greater, FloatType>(inputs.front(), outputs);
+}
+
+LogicalResult arith::ScalingExtFOp::verify() {
+ return verifyExtOp<FloatType>(*this);
+}
+
+//===----------------------------------------------------------------------===//
// TruncIOp
//===----------------------------------------------------------------------===//
@@ -1566,6 +1579,19 @@ LogicalResult arith::TruncFOp::verify() {
}
//===----------------------------------------------------------------------===//
+// ScalingTruncFOp
+//===----------------------------------------------------------------------===//
+
+bool arith::ScalingTruncFOp::areCastCompatible(TypeRange inputs,
+ TypeRange outputs) {
+ return checkWidthChangeCast<std::less, FloatType>(inputs.front(), outputs);
+}
+
+LogicalResult arith::ScalingTruncFOp::verify() {
+ return verifyTruncateOp<FloatType>(*this);
+}
+
+//===----------------------------------------------------------------------===//
// AndIOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index 95546bb..534aff9 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -6,10 +6,10 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -31,7 +31,6 @@ static Value createConst(Location loc, Type type, int value,
return rewriter.create<arith::ConstantOp>(
loc, DenseElementsAttr::get(shapedTy, attr));
}
-
return rewriter.create<arith::ConstantOp>(loc, attr);
}
@@ -357,9 +356,10 @@ struct F8E8M0ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
f32Bits = b.create<arith::SelectOp>(isNan, cF32NaN, f32Bits);
Value result = b.create<arith::BitcastOp>(f32Ty, f32Bits);
if (resultETy.getIntOrFloatBitWidth() < 32) {
- result = b.create<arith::TruncFOp>(resultTy, result);
+ result = b.create<arith::TruncFOp>(resultTy, result, nullptr,
+ op.getFastmathAttr());
} else if (resultETy.getIntOrFloatBitWidth() > 32) {
- result = b.create<arith::ExtFOp>(resultTy, result);
+ result = b.create<arith::ExtFOp>(resultTy, result, op.getFastmathAttr());
}
rewriter.replaceOp(op, result);
return success();
@@ -395,9 +395,10 @@ struct F8E8M0TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
Type f32Ty = cloneToShapedType(operandTy, b.getF32Type());
if (operandETy.getIntOrFloatBitWidth() < 32) {
- operand = b.create<arith::ExtFOp>(f32Ty, operand);
+ operand = b.create<arith::ExtFOp>(f32Ty, operand, op.getFastmathAttr());
} else if (operandETy.getIntOrFloatBitWidth() > 32) {
- operand = b.create<arith::TruncFOp>(f32Ty, operand);
+ operand = b.create<arith::TruncFOp>(
+ f32Ty, operand, op.getRoundingmodeAttr(), op.getFastmathAttr());
}
Value f32Bits = b.create<arith::BitcastOp>(i32Ty, operand);
Value cF32MantissaWidth = createConst(op->getLoc(), i32Ty, 23, rewriter);
@@ -409,6 +410,83 @@ struct F8E8M0TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
}
};
+struct ScalingExtFOpConverter : public OpRewritePattern<arith::ScalingExtFOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(arith::ScalingExtFOp op,
+ PatternRewriter &rewriter) const final {
+ ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+ Value inputOperand = op.getIn();
+ Value scaleOperand = op.getScale();
+ Type scaleTy = scaleOperand.getType();
+ Type scaleETy = getElementTypeOrSelf(scaleOperand);
+    // Allow implicit exponent extraction from 16/32-bit floats.
+ if (scaleETy.getIntOrFloatBitWidth() >= 16) {
+ scaleETy = b.getF8E8M0Type();
+ scaleTy = cloneToShapedType(scaleTy, scaleETy);
+ scaleOperand = b.create<arith::TruncFOp>(scaleTy, scaleOperand, nullptr,
+ op.getFastmathAttr());
+ }
+ if (!llvm::isa<Float8E8M0FNUType>(scaleETy)) {
+ return rewriter.notifyMatchFailure(
+          op, "scaling_extf is using a scale type that cannot be converted "
+              "to f8E8M0FNU");
+ }
+ Type resultTy = op.getType();
+    // extf on the scale creates a floating-point number of type resultTy
+    // equal to 2^scale; it also propagates NaNs.
+ Value scaleExt =
+ b.create<arith::ExtFOp>(resultTy, scaleOperand, op.getFastmathAttr());
+ Value inputExt =
+ b.create<arith::ExtFOp>(resultTy, inputOperand, op.getFastmathAttr());
+ Value result =
+ b.create<arith::MulFOp>(inputExt, scaleExt, op.getFastmathAttr());
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+};
+
+// Expands arith.ScalingTruncFOp(in, scale) into:
+//   scale  = arith.truncf(scale) : scaleTy -> f8E8M0FNU
+//   result = arith.truncf(in / (2^scale))
+struct ScalingTruncFOpConverter
+ : public OpRewritePattern<arith::ScalingTruncFOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(arith::ScalingTruncFOp op,
+ PatternRewriter &rewriter) const final {
+ ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+ Value inputOperand = op.getIn();
+ Value scaleOperand = op.getScale();
+ Type scaleTy = scaleOperand.getType();
+ Type scaleETy = getElementTypeOrSelf(scaleOperand);
+    // Allow implicit exponent extraction from 16/32-bit floats.
+ if (scaleETy.getIntOrFloatBitWidth() >= 16) {
+ scaleETy = b.getF8E8M0Type();
+ scaleTy = cloneToShapedType(scaleTy, scaleETy);
+ scaleOperand = b.create<arith::TruncFOp>(scaleTy, scaleOperand, nullptr,
+ op.getFastmathAttr());
+ }
+ if (!llvm::isa<Float8E8M0FNUType>(scaleETy)) {
+ return rewriter.notifyMatchFailure(
+          op, "scaling_truncf is using a scale type that cannot be converted "
+              "to f8E8M0FNU");
+ }
+ Type resultTy = op.getType();
+ Type inputTy = inputOperand.getType();
+    // This creates a floating-point number of type inputTy equal to
+    // 2^scale; it also propagates NaNs.
+ scaleOperand =
+ b.create<arith::ExtFOp>(inputTy, scaleOperand, op.getFastmathAttr());
+ Value result = b.create<arith::DivFOp>(inputOperand, scaleOperand,
+ op.getFastmathAttr());
+ Value resultCast = b.create<arith::TruncFOp>(
+ resultTy, result, op.getRoundingmodeAttr(), op.getFastmathAttr());
+ rewriter.replaceOp(op, resultCast);
+ return success();
+ }
+};
+
struct ArithExpandOpsPass
: public arith::impl::ArithExpandOpsPassBase<ArithExpandOpsPass> {
using ArithExpandOpsPassBase::ArithExpandOpsPassBase;
@@ -432,7 +510,9 @@ struct ArithExpandOpsPass
arith::MaximumFOp,
arith::MinimumFOp,
arith::MaxNumFOp,
- arith::MinNumFOp
+ arith::MinNumFOp,
+ arith::ScalingExtFOp,
+ arith::ScalingTruncFOp
>();
if (includeBf16) {
@@ -492,8 +572,15 @@ void mlir::arith::populateExpandF8E8M0Patterns(RewritePatternSet &patterns) {
patterns.getContext());
}
+void mlir::arith::populateExpandScalingExtTruncPatterns(
+ RewritePatternSet &patterns) {
+ patterns.add<ScalingExtFOpConverter, ScalingTruncFOpConverter>(
+ patterns.getContext());
+}
+
void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
populateCeilFloorDivExpandOpsPatterns(patterns);
+ populateExpandScalingExtTruncPatterns(patterns);
// clang-format off
patterns.add<
MaxMinIOpConverter<MaxSIOp, arith::CmpIPredicate::sgt>,
@@ -503,7 +590,7 @@ void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
MaximumMinimumFOpConverter<MaximumFOp, arith::CmpFPredicate::UGT>,
MaximumMinimumFOpConverter<MinimumFOp, arith::CmpFPredicate::ULT>,
MaxNumMinNumFOpConverter<MaxNumFOp, arith::CmpFPredicate::UGT>,
- MaxNumMinNumFOpConverter<MinNumFOp, arith::CmpFPredicate::ULT>
+ MaxNumMinNumFOpConverter<MinNumFOp, arith::CmpFPredicate::ULT>
>(patterns.getContext());
// clang-format on
}
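
Taken together, the converters above implement the following algebra; the f8E8M0FNU encoding (bias 127, exponent 255 reserved for NaN) is stated here as background, not taken from the patch:

```latex
\mathrm{scaling\_extf}(x, s)   = \mathrm{extf}(x) \cdot 2^{\,E(s) - 127}, \qquad
\mathrm{scaling\_truncf}(x, s) = \mathrm{truncf}\!\left(x \cdot 2^{-(E(s) - 127)}\right)
```

where E(s) is the biased exponent held by the f8E8M0FNU scale; E(s) = 255 denotes NaN, which the generated extf/mulf/divf chain propagates.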
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
index 72a05ff..e6c9adb 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -528,8 +528,7 @@ chooseSpillUsingHeuristics(OverlappingRangesIterator overlappingRanges,
a.end() < b.end();
};
LiveRange &latestEndingLiveRange =
- *std::max_element(overlappingRanges.begin(), overlappingRanges.end(),
- isSmallerTileTypeOrEndsEarlier);
+ *llvm::max_element(overlappingRanges, isSmallerTileTypeOrEndsEarlier);
if (!isSmallerTileTypeOrEndsEarlier(latestEndingLiveRange, *newRange))
return &latestEndingLiveRange;
return newRange;
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
index e36d4b9..03af61c 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
@@ -326,7 +326,6 @@ void spirv::UMulExtendedOp::getCanonicalizationPatterns(
// The transformation is only applied if one divisor is a multiple of the other.
-// TODO(https://github.com/llvm/llvm-project/issues/63174): Add support for vector constants
struct UModSimplification final : OpRewritePattern<spirv::UModOp> {
using OpRewritePattern::OpRewritePattern;
@@ -336,19 +335,29 @@ struct UModSimplification final : OpRewritePattern<spirv::UModOp> {
if (!prevUMod)
return failure();
- IntegerAttr prevValue;
- IntegerAttr currValue;
+ TypedAttr prevValue;
+ TypedAttr currValue;
if (!matchPattern(prevUMod.getOperand(1), m_Constant(&prevValue)) ||
!matchPattern(umodOp.getOperand(1), m_Constant(&currValue)))
return failure();
- APInt prevConstValue = prevValue.getValue();
- APInt currConstValue = currValue.getValue();
+    // Ensure that the previous divisor is a multiple of the current one;
+    // otherwise fail the transformation.
+ bool isApplicable = false;
+ if (auto prevInt = dyn_cast<IntegerAttr>(prevValue)) {
+ auto currInt = cast<IntegerAttr>(currValue);
+ isApplicable = prevInt.getValue().urem(currInt.getValue()) == 0;
+ } else if (auto prevVec = dyn_cast<DenseElementsAttr>(prevValue)) {
+ auto currVec = cast<DenseElementsAttr>(currValue);
+ isApplicable = llvm::all_of(llvm::zip_equal(prevVec.getValues<APInt>(),
+ currVec.getValues<APInt>()),
+ [](const auto &pair) {
+ auto &[prev, curr] = pair;
+ return prev.urem(curr) == 0;
+ });
+ }
- // Ensure that one divisor is a multiple of the other. If not, fail the
- // transformation.
- if (prevConstValue.urem(currConstValue) != 0 &&
- currConstValue.urem(prevConstValue) != 0)
+ if (!isApplicable)
return failure();
// The transformation is safe. Replace the existing UMod operation with a
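
The applicability check relies on a small modular-arithmetic identity. With a the previous divisor and b the current one, `prev.urem(curr) == 0` establishes that b divides a, and then

```latex
b \mid a \;\Longrightarrow\; (x \bmod a) \bmod b = x \bmod b,
```

since x mod a equals x - ka for some integer k and b divides ka. The vector case applies this lane-wise, which is what the `zip_equal` loop verifies.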
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
index 337df3a..1aff43c 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
@@ -194,8 +194,21 @@ std::optional<int64_t> CompositeType::getSizeInBytes() {
//===----------------------------------------------------------------------===//
struct spirv::detail::CooperativeMatrixTypeStorage final : TypeStorage {
+  // In the specification, the dimensions of a cooperative matrix are 32-bit
+  // integers, and the initial implementation stored them as such. However,
+  // `ShapedType` expects the shape to be `int64_t`. We could keep the
+  // dimensions as 32-bit values and widen them in `getShape`, but that method
+  // returns an `ArrayRef`, so building an `ArrayRef<int64_t>` out of two
+  // 32-bit integers would require extra logic and storage. We therefore
+  // diverge from the spec and store the dimensions as 64-bit integers, which
+  // lets `getShape` return an `ArrayRef` directly. Alternatively, we could
+  // store both the 32-bit rows/columns and a 64-bit shape, refreshing the
+  // shape whenever `getShape` is called, again at the cost of extra logic and
+  // storage. Note that because `getShape` returns an `ArrayRef`, it cannot
+  // construct the array on the fly.
using KeyTy =
- std::tuple<Type, uint32_t, uint32_t, Scope, CooperativeMatrixUseKHR>;
+ std::tuple<Type, int64_t, int64_t, Scope, CooperativeMatrixUseKHR>;
static CooperativeMatrixTypeStorage *
construct(TypeStorageAllocator &allocator, const KeyTy &key) {
@@ -204,17 +217,17 @@ struct spirv::detail::CooperativeMatrixTypeStorage final : TypeStorage {
}
bool operator==(const KeyTy &key) const {
- return key == KeyTy(elementType, rows, columns, scope, use);
+ return key == KeyTy(elementType, shape[0], shape[1], scope, use);
}
CooperativeMatrixTypeStorage(const KeyTy &key)
- : elementType(std::get<0>(key)), rows(std::get<1>(key)),
- columns(std::get<2>(key)), scope(std::get<3>(key)),
+ : elementType(std::get<0>(key)),
+ shape({std::get<1>(key), std::get<2>(key)}), scope(std::get<3>(key)),
use(std::get<4>(key)) {}
Type elementType;
- uint32_t rows;
- uint32_t columns;
+ // [#rows, #columns]
+ std::array<int64_t, 2> shape;
Scope scope;
CooperativeMatrixUseKHR use;
};
@@ -231,10 +244,18 @@ Type CooperativeMatrixType::getElementType() const {
return getImpl()->elementType;
}
-uint32_t CooperativeMatrixType::getRows() const { return getImpl()->rows; }
+uint32_t CooperativeMatrixType::getRows() const {
+ assert(getImpl()->shape[0] != ShapedType::kDynamic);
+ return static_cast<uint32_t>(getImpl()->shape[0]);
+}
uint32_t CooperativeMatrixType::getColumns() const {
- return getImpl()->columns;
+ assert(getImpl()->shape[1] != ShapedType::kDynamic);
+ return static_cast<uint32_t>(getImpl()->shape[1]);
+}
+
+ArrayRef<int64_t> CooperativeMatrixType::getShape() const {
+ return getImpl()->shape;
}
Scope CooperativeMatrixType::getScope() const { return getImpl()->scope; }
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 045c192..52a9ced 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1554,22 +1554,36 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
llvm::SmallSetVector<Value, 32> escapingValues;
SmallVector<Type> inputTypes;
SmallVector<Type> distTypes;
+ auto collectEscapingValues = [&](Value value) {
+ if (!escapingValues.insert(value))
+ return;
+ Type distType = value.getType();
+ if (auto vecType = dyn_cast<VectorType>(distType)) {
+ AffineMap map = distributionMapFn(value);
+ distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+ }
+ inputTypes.push_back(value.getType());
+ distTypes.push_back(distType);
+ };
+
mlir::visitUsedValuesDefinedAbove(
forOp.getBodyRegion(), [&](OpOperand *operand) {
Operation *parent = operand->get().getParentRegion()->getParentOp();
if (warpOp->isAncestor(parent)) {
- if (!escapingValues.insert(operand->get()))
- return;
- Type distType = operand->get().getType();
- if (auto vecType = dyn_cast<VectorType>(distType)) {
- AffineMap map = distributionMapFn(operand->get());
- distType = getDistributedType(vecType, map, warpOp.getWarpSize());
- }
- inputTypes.push_back(operand->get().getType());
- distTypes.push_back(distType);
+ collectEscapingValues(operand->get());
}
});
+ // Any forOp result that is not already yielded by the warpOp
+ // region is also considered escaping and must be returned by the
+ // original warpOp.
+ for (OpResult forResult : forOp.getResults()) {
+ // Check if this forResult is already yielded by the yield op.
+ if (llvm::is_contained(yield->getOperands(), forResult))
+ continue;
+ collectEscapingValues(forResult);
+ }
+
if (llvm::is_contained(distTypes, Type{}))
return failure();
@@ -1609,7 +1623,12 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
forOp.getResultTypes().end());
llvm::SmallDenseMap<Value, int64_t> argIndexMapping;
for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) {
- warpInput.push_back(newWarpOp.getResult(retIdx));
+ auto newWarpResult = newWarpOp.getResult(retIdx);
+ // Unused forOp results yielded by the warpOp region are already included
+ // in the new ForOp.
+ if (llvm::is_contained(newOperands, newWarpResult))
+ continue;
+ warpInput.push_back(newWarpResult);
argIndexMapping[escapingValues[i]] = warpInputType.size();
warpInputType.push_back(inputTypes[i]);
}
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index 8910211..5f7bc50 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -34,6 +34,8 @@ Location Builder::getFusedLoc(ArrayRef<Location> locs, Attribute metadata) {
// Types.
//===----------------------------------------------------------------------===//
+FloatType Builder::getF8E8M0Type() { return Float8E8M0FNUType::get(context); }
+
FloatType Builder::getBF16Type() { return BFloat16Type::get(context); }
FloatType Builder::getF16Type() { return Float16Type::get(context); }
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 2ab6b61..ce5e63a 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -810,7 +810,7 @@ OperationName::OperationName(StringRef name, MLIRContext *context) {
// Acquire a writer-lock so that we can safely create the new instance.
ScopedWriterLock lock(ctxImpl.operationInfoMutex, isMultithreadingEnabled);
- auto it = ctxImpl.operations.insert({name, nullptr});
+ auto it = ctxImpl.operations.try_emplace(name);
if (it.second) {
auto nameAttr = StringAttr::get(context, name);
it.first->second = std::make_unique<UnregisteredOpModel>(
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index e3fa7c8..73e8626 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -20,8 +20,6 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
using namespace mlir;
namespace mlir {
@@ -37,10 +35,8 @@ void registerToLLVMIRTranslation() {
// When printing LLVM IR, we should convert the module to the debug info
// format that LLVM expects us to print.
// See https://llvm.org/docs/RemoveDIsDebugInfo.html
- llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule,
- UseNewDbgInfoFormat);
- if (UseNewDbgInfoFormat)
- llvmModule->removeDebugIntrinsicDeclarations();
+ llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule, true);
+ llvmModule->removeDebugIntrinsicDeclarations();
llvmModule->print(output, nullptr);
return success();
},
diff --git a/mlir/lib/Target/LLVMIR/LoopAnnotationTranslation.cpp b/mlir/lib/Target/LLVMIR/LoopAnnotationTranslation.cpp
index 1dde457..54ae01c 100644
--- a/mlir/lib/Target/LLVMIR/LoopAnnotationTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/LoopAnnotationTranslation.cpp
@@ -280,7 +280,7 @@ LoopAnnotationTranslation::translateLoopAnnotation(LoopAnnotationAttr attr,
llvm::MDNode *
LoopAnnotationTranslation::getAccessGroup(AccessGroupAttr accessGroupAttr) {
auto [result, inserted] =
- accessGroupMetadataMapping.insert({accessGroupAttr, nullptr});
+ accessGroupMetadataMapping.try_emplace(accessGroupAttr);
if (inserted)
result->second = llvm::MDNode::getDistinct(llvmModule.getContext(), {});
return result->second;
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 4cc419c..22b391b 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -67,8 +67,6 @@ using namespace mlir;
using namespace mlir::LLVM;
using namespace mlir::LLVM::detail;
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
#include "mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc"
namespace {
@@ -2328,7 +2326,7 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext,
// Once we've finished constructing elements in the module, we should convert
// it to use the debug info format desired by LLVM.
// See https://llvm.org/docs/RemoveDIsDebugInfo.html
- translator.llvmModule->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
+ translator.llvmModule->setIsNewDbgInfoFormat(true);
// Add the necessary debug info module flags, if they were not encoded in MLIR
// beforehand.
diff --git a/mlir/lib/Tools/lsp-server-support/Protocol.cpp b/mlir/lib/Tools/lsp-server-support/Protocol.cpp
index e4eb251..0054dc3 100644
--- a/mlir/lib/Tools/lsp-server-support/Protocol.cpp
+++ b/mlir/lib/Tools/lsp-server-support/Protocol.cpp
@@ -289,6 +289,11 @@ bool mlir::lsp::fromJSON(const llvm::json::Value &value,
if (codeAction->getObject("codeActionLiteralSupport"))
result.codeActionStructure = true;
}
+ if (auto *window = textDocument->getObject("window")) {
+ if (std::optional<bool> workDoneProgressSupport =
+ window->getBoolean("workDoneProgress"))
+ result.workDoneProgress = *workDoneProgressSupport;
+ }
}
return true;
}
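
A hypothetical payload exercising the new branch; note the patch reads `window` from under `textDocument`, and the helper below mirrors that lookup path (illustrative only, not part of the server):

```cpp
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"

// Returns true iff the payload advertises work-done-progress support,
// following the same lookup path as fromJSON above.
bool advertisesWorkDoneProgress(llvm::StringRef payload) {
  llvm::Expected<llvm::json::Value> value = llvm::json::parse(payload);
  if (!value) {
    llvm::consumeError(value.takeError());
    return false;
  }
  const llvm::json::Object *root = value->getAsObject();
  const llvm::json::Object *td =
      root ? root->getObject("textDocument") : nullptr;
  const llvm::json::Object *win = td ? td->getObject("window") : nullptr;
  return win && win->getBoolean("workDoneProgress").value_or(false);
}
```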
diff --git a/mlir/lib/Transforms/Utils/CFGToSCF.cpp b/mlir/lib/Transforms/Utils/CFGToSCF.cpp
index eefdf1d..de380fc 100644
--- a/mlir/lib/Transforms/Utils/CFGToSCF.cpp
+++ b/mlir/lib/Transforms/Utils/CFGToSCF.cpp
@@ -427,8 +427,7 @@ public:
/// region with an instance of `returnLikeOp`s kind.
void combineExit(Operation *returnLikeOp,
function_ref<Value(unsigned)> getSwitchValue) {
- auto [iter, inserted] =
- returnLikeToCombinedExit.insert({returnLikeOp, nullptr});
+ auto [iter, inserted] = returnLikeToCombinedExit.try_emplace(returnLikeOp);
if (!inserted && iter->first == returnLikeOp)
return;
@@ -1284,7 +1283,7 @@ FailureOr<bool> mlir::transformCFGToSCF(Region &region,
DenseMap<Type, Value> typedUndefCache;
auto getUndefValue = [&](Type type) {
- auto [iter, inserted] = typedUndefCache.insert({type, nullptr});
+ auto [iter, inserted] = typedUndefCache.try_emplace(type);
if (!inserted)
return iter->second;
diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index 54b5c78..ae34c4a 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -45,7 +45,7 @@ static void walkReferencedSymbolNodes(
Operation *symbolTableOp = op->getParentOp();
for (const SymbolTable::SymbolUse &use : *symbolUses) {
- auto refIt = resolvedRefs.insert({use.getSymbolRef(), nullptr});
+ auto refIt = resolvedRefs.try_emplace(use.getSymbolRef());
CallGraphNode *&node = refIt.first->second;
// If this is the first instance of this reference, try to resolve a
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
index 5b6badf..db1349f 100644
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt %s -arith-expand="include-bf16=true include-f8e8m0=true" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -arith-expand="include-bf16=true include-f8e8m0=true" -verify-diagnostics -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -arith-expand -split-input-file -verify-diagnostics | FileCheck %s --check-prefix=SCHECK
// Test ceil divide with signed integer
// CHECK-LABEL: func @ceildivi
@@ -253,7 +254,7 @@ func.func @truncf_f32_to_f8E8M0FNU(%arg0 : f32) -> f8E8M0FNU {
%0 = arith.truncf %arg0 : f32 to f8E8M0FNU
return %0 : f8E8M0FNU
}
-// CHECK-LABLE: @truncf_f32_to_f8E8M0FNU
+// CHECK-LABEL: @truncf_f32_to_f8E8M0FNU
// CHECK: %[[BITCAST:.+]] = arith.bitcast %arg0 : f32 to i32
// CHECK: %[[C23_i32:.+]] = arith.constant 23 : i32
// CHECK: %[[SHRUI:.+]] = arith.shrui %[[BITCAST]], %[[C23_i32]] : i32
@@ -267,7 +268,7 @@ func.func @truncf_f16_to_f8E8M0FNU(%arg0 : f16) -> f8E8M0FNU {
%0 = arith.truncf %arg0 : f16 to f8E8M0FNU
return %0 : f8E8M0FNU
}
-// CHECK-LABLE: @truncf_f16_to_f8E8M0FNU
+// CHECK-LABEL: @truncf_f16_to_f8E8M0FNU
// CHECK: %[[EXTF:.+]] = arith.extf %arg0 : f16 to f32
// CHECK: %[[BITCAST:.+]] = arith.bitcast %[[EXTF]] : f32 to i32
// CHECK: %[[C23_i32:.+]] = arith.constant 23 : i32
@@ -305,9 +306,76 @@ func.func @truncf_vector_bf16_to_f8E8M0FNU(%arg0 : vector<4xbf16>) -> vector<4xf
// CHECK-LABEL: @truncf_vector_bf16_to_f8E8M0FNU
// CHECK-NOT: arith.truncf
+// CHECK: return
+// -----
+
+func.func @scaling_truncf_f32_to_f4E2M1FN(%arg0 : f32, %arg1: f8E8M0FNU) -> f4E2M1FN {
+ %0 = arith.scaling_truncf %arg0, %arg1 : f32, f8E8M0FNU to f4E2M1FN
+ return %0 : f4E2M1FN
+}
+
+// SCHECK-LABEL: @scaling_truncf_f32_to_f4E2M1FN
+// SCHECK: %[[SCALEF32:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// SCHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEF32]] : f32
+// SCHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : f32 to f4E2M1FN
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_truncf_vector_f16_to_f6E3M2FN(%arg0 : vector<4xf16>, %arg1: vector<4xf8E8M0FNU>) -> vector<4xf6E3M2FN> {
+ %0 = arith.scaling_truncf %arg0, %arg1 : vector<4xf16>, vector<4xf8E8M0FNU> to vector<4xf6E3M2FN>
+ return %0 : vector<4xf6E3M2FN>
+}
+
+// SCHECK-LABEL: @scaling_truncf_vector_f16_to_f6E3M2FN
+// SCHECK: %[[SCALEF16:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf16>
+// SCHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEF16]] : vector<4xf16>
+// SCHECK: %[[RESULT:.+]] = arith.truncf %[[DIVF]] : vector<4xf16> to vector<4xf6E3M2FN>
+// SCHECK: return %[[RESULT]] : vector<4xf6E3M2FN>
// -----
+
+func.func @scaling_truncf_propagate_rounding_mode_fast_math(%arg0 : vector<4xf16>, %arg1: vector<4xf16>) -> vector<4xf6E3M2FN> {
+ %0 = arith.scaling_truncf %arg0, %arg1 to_nearest_even fastmath<fast> : vector<4xf16>, vector<4xf16> to vector<4xf6E3M2FN>
+ return %0 : vector<4xf6E3M2FN>
+}
+// SCHECK-LABEL: @scaling_truncf_propagate_rounding_mode_fast_math
+// SCHECK: %[[SCALEF8:.+]] = arith.truncf %arg1 fastmath<fast> : vector<4xf16> to vector<4xf8E8M0FNU>
+// SCHECK: %[[SCALEINTY:.+]] = arith.extf %[[SCALEF8]] fastmath<fast> : vector<4xf8E8M0FNU> to vector<4xf16>
+// SCHECK: %[[DIVF:.+]] = arith.divf %arg0, %[[SCALEINTY]] fastmath<fast> : vector<4xf16>
+// SCHECK: %[[TRUNCF:.+]] = arith.truncf [[_:%[a-zA-Z0-9_]+]] to_nearest_even fastmath<fast> : vector<4xf16> to vector<4xf6E3M2FN>
+// SCHECK: return %[[TRUNCF]] : vector<4xf6E3M2FN>
+
+// -----
+
+func.func @scaling_truncf_f16_to_f4E2M1FN_using_f16_scales(%arg0: f16, %arg1 : f16) -> f4E2M1FN {
+ %0 = arith.scaling_truncf %arg0, %arg1 : f16, f16 to f4E2M1FN
+ return %0 : f4E2M1FN
+}
+// SCHECK-LABEL: @scaling_truncf_f16_to_f4E2M1FN_using_f16_scales
+// SCHECK: %[[SCALETRUNCF:.+]] = arith.truncf %arg1 : f16 to f8E8M0FNU
+// SCHECK: return
+
+// -----
+func.func @scaling_truncf_vector_f16_to_f4E2M1FN_using_f16_scales(%arg0: vector<4xf16>, %arg1 : vector<4xf16>) -> vector<4xf4E2M1FN> {
+ %0 = arith.scaling_truncf %arg0, %arg1 : vector<4xf16>, vector<4xf16> to vector<4xf4E2M1FN>
+ return %0 : vector<4xf4E2M1FN>
+}
+// SCHECK-LABEL: @scaling_truncf_vector_f16_to_f4E2M1FN_using_f16_scales
+// SCHECK: %[[SCALETRUNCF:.+]] = arith.truncf %arg1 : vector<4xf16> to vector<4xf8E8M0FNU>
+// SCHECK: return
+
+// -----
+
+func.func @invalid_scaling_truncf_to_f4E2M1FN(%arg0: f16, %arg1 : f8E5M2FNUZ) -> f4E2M1FN {
+ // expected-error@+1 {{failed to legalize operation 'arith.scaling_truncf' that was explicitly marked illegal}}
+ %0 = arith.scaling_truncf %arg0, %arg1 : f16, f8E5M2FNUZ to f4E2M1FN
+ return %0 : f4E2M1FN
+}
+
+// -----
+
func.func @extf_f8E8M0FNU_to_f32(%arg0 : f8E8M0FNU) -> f32 {
%0 = arith.extf %arg0 : f8E8M0FNU to f32
return %0 : f32
@@ -332,7 +400,7 @@ func.func @extf_f8E8M0FNU_to_f16(%arg0 : f8E8M0FNU) -> f16 {
return %0 : f16
}
-// CHECK-LABLE: @extf_f8E8M0FNU_to_f16
+// CHECK-LABEL: @extf_f8E8M0FNU_to_f16
// CHECK: %[[BITCAST:.+]] = arith.bitcast %arg0 : f8E8M0FNU to i8
// CHECK-DAG: %[[CF8NAN:.+]] = arith.constant -1 : i8
// CHECK-DAG: %[[CF32NAN:.+]] = arith.constant -1 : i32
@@ -374,7 +442,109 @@ func.func @extf_vector_f8E8M0FNU_to_bf16(%arg0 : vector<4xf8E8M0FNU>) -> vector<
// CHECK-LABEL: @extf_vector_f8E8M0FNU_to_bf16
// CHECK-NOT: arith.extf
+// CHECK: return
+
+// -----
+
+func.func @scaling_extf_to_f32(%arg0: f4E2M1FN, %arg1 : f8E8M0FNU) -> f32 {
+ %0 = arith.scaling_extf %arg0, %arg1 : f4E2M1FN, f8E8M0FNU to f32
+ return %0 : f32
+}
+
+// SCHECK-LABEL: @scaling_extf_to_f32
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : f8E8M0FNU to f32
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : f4E2M1FN to f32
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : f32
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_to_f32_using_f16_scales(%arg0: f4E2M1FN, %arg1 : f16) -> f32 {
+ %0 = arith.scaling_extf %arg0, %arg1 : f4E2M1FN, f16 to f32
+ return %0 : f32
+}
+
+// SCHECK-LABEL: @scaling_extf_to_f32_using_f16_scales
+// SCHECK: %[[TRUNCF_SCALE:.+]] = arith.truncf %arg1 : f16 to f8E8M0FNU
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %[[TRUNCF_SCALE]] : f8E8M0FNU to f32
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : f4E2M1FN to f32
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : f32
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @invalid_scaling_extf_to_f32(%arg0: f4E2M1FN, %arg1 : f8E5M2FNUZ) -> f32 {
+ // expected-error@+1 {{failed to legalize operation 'arith.scaling_extf' that was explicitly marked illegal}}
+ %0 = arith.scaling_extf %arg0, %arg1 : f4E2M1FN, f8E5M2FNUZ to f32
+ return %0 : f32
+}
+
+// -----
+
+func.func @scaling_extf_vector_to_f32(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf8E8M0FNU>) -> vector<4xf32> {
+ %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf8E8M0FNU> to vector<4xf32>
+ return %0 : vector<4xf32>
+}
+
+// SCHECK-LABEL: @scaling_extf_vector_to_f32
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf32>
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf32>
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xf32>
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_f16(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf8E8M0FNU>) -> vector<4xf16> {
+ %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf8E8M0FNU> to vector<4xf16>
+ return %0 : vector<4xf16>
+}
+
+// SCHECK-LABEL: @scaling_extf_vector_to_f16
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xf16>
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf16>
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xf16>
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_bf16(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf8E8M0FNU>) -> vector<4xbf16> {
+ %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf8E8M0FNU> to vector<4xbf16>
+ return %0 : vector<4xbf16>
+}
+
+// SCHECK-LABEL: @scaling_extf_vector_to_bf16
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %arg1 : vector<4xf8E8M0FNU> to vector<4xbf16>
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xbf16>
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xbf16>
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_f32_using_f16_scales(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf16>) -> vector<4xf32> {
+ %0 = arith.scaling_extf %arg0, %arg1 : vector<4xf4E2M1FN>, vector<4xf16> to vector<4xf32>
+ return %0 : vector<4xf32>
+}
+
+// SCHECK-LABEL: @scaling_extf_vector_to_f32_using_f16_scales
+// SCHECK: %[[TRUNCF_SCALE:.+]] = arith.truncf %arg1 : vector<4xf16> to vector<4xf8E8M0FNU>
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %[[TRUNCF_SCALE]] : vector<4xf8E8M0FNU> to vector<4xf32>
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf32>
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] : vector<4xf32>
+// SCHECK: return %[[RESULT]]
+
+// -----
+
+func.func @scaling_extf_vector_to_f32_using_f16_scales_fastmath(%arg0: vector<4xf4E2M1FN>, %arg1 : vector<4xf16>) -> vector<4xf32> {
+ %0 = arith.scaling_extf %arg0, %arg1 fastmath<fast> : vector<4xf4E2M1FN>, vector<4xf16> to vector<4xf32>
+ return %0 : vector<4xf32>
+}
+// SCHECK-LABEL: @scaling_extf_vector_to_f32_using_f16_scales_fastmath
+// SCHECK: %[[TRUNCF_SCALE:.+]] = arith.truncf %arg1 fastmath<fast> : vector<4xf16> to vector<4xf8E8M0FNU>
+// SCHECK: %[[EXT_SCALE:.+]] = arith.extf %[[TRUNCF_SCALE]] fastmath<fast> : vector<4xf8E8M0FNU> to vector<4xf32>
+// SCHECK: %[[EXT_INPUT:.+]] = arith.extf %arg0 fastmath<fast> : vector<4xf4E2M1FN> to vector<4xf32>
+// SCHECK: %[[RESULT:.+]] = arith.mulf %[[EXT_INPUT]], %[[EXT_SCALE]] fastmath<fast> : vector<4xf32>
+// SCHECK: return %[[RESULT]]
// -----
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 0be0479..680a7e1 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -255,6 +255,54 @@ func.func @coshvec(%arg0 : vector<3xf16>) -> () {
}
//===----------------------------------------------------------------------===//
+// spirv.GL.Asinh
+//===----------------------------------------------------------------------===//
+
+func.func @asinh(%arg0 : f32) -> () {
+ // CHECK: spirv.GL.Asinh {{%.*}} : f32
+ %2 = spirv.GL.Asinh %arg0 : f32
+ return
+}
+
+func.func @asinhvec(%arg0 : vector<3xf16>) -> () {
+ // CHECK: spirv.GL.Asinh {{%.*}} : vector<3xf16>
+ %2 = spirv.GL.Asinh %arg0 : vector<3xf16>
+ return
+}
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Acosh
+//===----------------------------------------------------------------------===//
+
+func.func @acosh(%arg0 : f32) -> () {
+ // CHECK: spirv.GL.Acosh {{%.*}} : f32
+ %2 = spirv.GL.Acosh %arg0 : f32
+ return
+}
+
+func.func @acoshvec(%arg0 : vector<3xf16>) -> () {
+ // CHECK: spirv.GL.Acosh {{%.*}} : vector<3xf16>
+ %2 = spirv.GL.Acosh %arg0 : vector<3xf16>
+ return
+}
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Atanh
+//===----------------------------------------------------------------------===//
+
+func.func @atanh(%arg0 : f32) -> () {
+ // CHECK: spirv.GL.Atanh {{%.*}} : f32
+ %2 = spirv.GL.Atanh %arg0 : f32
+ return
+}
+
+func.func @atanhvec(%arg0 : vector<3xf16>) -> () {
+ // CHECK: spirv.GL.Atanh {{%.*}} : vector<3xf16>
+ %2 = spirv.GL.Atanh %arg0 : vector<3xf16>
+ return
+}
+
+//===----------------------------------------------------------------------===//
// spirv.GL.Pow
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/khr-cooperative-matrix-ops.mlir b/mlir/test/Dialect/SPIRV/IR/khr-cooperative-matrix-ops.mlir
index d3e1dbc..8733ff9 100644
--- a/mlir/test/Dialect/SPIRV/IR/khr-cooperative-matrix-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/khr-cooperative-matrix-ops.mlir
@@ -524,7 +524,7 @@ spirv.func @matrix_times_scalar(%a: !matA_f32, %b: f32) "None" {
spirv.func @iadd(%a: !spirv.coopmatrix<2x2xi32, Subgroup, MatrixA>,
%b: !spirv.coopmatrix<2x2xi32, Subgroup, MatrixB>) "None" {
- // expected-error @+1 {{op requires the same type for all operands and results}}
+ // expected-error @+1 {{failed to verify that all of {operand1, operand2, result} have same type}}
%q = "spirv.IAdd"(%a, %b) :
(!spirv.coopmatrix<2x2xi32, Subgroup, MatrixA>, !spirv.coopmatrix<2x2xi32, Subgroup, MatrixB>)
-> !spirv.coopmatrix<2x2xi32, Subgroup, MatrixA>
@@ -535,7 +535,7 @@ spirv.func @iadd(%a: !spirv.coopmatrix<2x2xi32, Subgroup, MatrixA>,
spirv.func @fadd(%a: !spirv.coopmatrix<2x2xf32, Subgroup, MatrixA>,
%b: !spirv.coopmatrix<2x2xf32, Subgroup, MatrixAcc>) "None" {
- // expected-error @+1 {{op requires the same type for all operands and results}}
+ // expected-error @+1 {{failed to verify that all of {operand1, operand2, result} have same type}}
%q = "spirv.FAdd"(%a, %b) :
(!spirv.coopmatrix<2x2xf32, Subgroup, MatrixA>, !spirv.coopmatrix<2x2xf32, Subgroup, MatrixAcc>)
-> !spirv.coopmatrix<2x2xf32, Subgroup, MatrixAcc>
diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
index 0fd6c18..722c275 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
@@ -967,17 +967,17 @@ func.func @umod_fold(%arg0: i32) -> (i32, i32) {
return %0, %1: i32, i32
}
-// CHECK-LABEL: @umod_fail_vector_fold
+// CHECK-LABEL: @umod_vector_fold
// CHECK-SAME: (%[[ARG:.*]]: vector<4xi32>)
-func.func @umod_fail_vector_fold(%arg0: vector<4xi32>) -> (vector<4xi32>, vector<4xi32>) {
+func.func @umod_vector_fold(%arg0: vector<4xi32>) -> (vector<4xi32>, vector<4xi32>) {
// CHECK: %[[CONST4:.*]] = spirv.Constant dense<4> : vector<4xi32>
// CHECK: %[[CONST32:.*]] = spirv.Constant dense<32> : vector<4xi32>
%const1 = spirv.Constant dense<32> : vector<4xi32>
%0 = spirv.UMod %arg0, %const1 : vector<4xi32>
- // CHECK: %[[UMOD0:.*]] = spirv.UMod %[[ARG]], %[[CONST32]]
%const2 = spirv.Constant dense<4> : vector<4xi32>
%1 = spirv.UMod %0, %const2 : vector<4xi32>
- // CHECK: %[[UMOD1:.*]] = spirv.UMod %[[UMOD0]], %[[CONST4]]
+ // CHECK: %[[UMOD0:.*]] = spirv.UMod %[[ARG]], %[[CONST32]]
+ // CHECK: %[[UMOD1:.*]] = spirv.UMod %[[ARG]], %[[CONST4]]
// CHECK: return %[[UMOD0]], %[[UMOD1]]
return %0, %1: vector<4xi32>, vector<4xi32>
}
@@ -996,9 +996,9 @@ func.func @umod_fold_same_divisor(%arg0: i32) -> (i32, i32) {
return %0, %1: i32, i32
}
-// CHECK-LABEL: @umod_fail_fold
+// CHECK-LABEL: @umod_fail_1_fold
// CHECK-SAME: (%[[ARG:.*]]: i32)
-func.func @umod_fail_fold(%arg0: i32) -> (i32, i32) {
+func.func @umod_fail_1_fold(%arg0: i32) -> (i32, i32) {
// CHECK: %[[CONST5:.*]] = spirv.Constant 5
// CHECK: %[[CONST32:.*]] = spirv.Constant 32
%const1 = spirv.Constant 32 : i32
@@ -1011,6 +1011,51 @@ func.func @umod_fail_fold(%arg0: i32) -> (i32, i32) {
return %0, %1: i32, i32
}
+// CHECK-LABEL: @umod_fail_2_fold
+// CHECK-SAME: (%[[ARG:.*]]: i32)
+func.func @umod_fail_2_fold(%arg0: i32) -> (i32, i32) {
+ // CHECK: %[[CONST32:.*]] = spirv.Constant 32
+ // CHECK: %[[CONST4:.*]] = spirv.Constant 4
+ %const1 = spirv.Constant 4 : i32
+ %0 = spirv.UMod %arg0, %const1 : i32
+ // CHECK: %[[UMOD0:.*]] = spirv.UMod %[[ARG]], %[[CONST4]]
+ %const2 = spirv.Constant 32 : i32
+ %1 = spirv.UMod %0, %const2 : i32
+ // CHECK: %[[UMOD1:.*]] = spirv.UMod %[[UMOD0]], %[[CONST32]]
+ // CHECK: return %[[UMOD0]], %[[UMOD1]]
+ return %0, %1: i32, i32
+}
+
+// CHECK-LABEL: @umod_vector_fail_1_fold
+// CHECK-SAME: (%[[ARG:.*]]: vector<4xi32>)
+func.func @umod_vector_fail_1_fold(%arg0: vector<4xi32>) -> (vector<4xi32>, vector<4xi32>) {
+ // CHECK: %[[CONST9:.*]] = spirv.Constant dense<9> : vector<4xi32>
+ // CHECK: %[[CONST64:.*]] = spirv.Constant dense<64> : vector<4xi32>
+ %const1 = spirv.Constant dense<64> : vector<4xi32>
+ %0 = spirv.UMod %arg0, %const1 : vector<4xi32>
+ // CHECK: %[[UMOD0:.*]] = spirv.UMod %[[ARG]], %[[CONST64]]
+ %const2 = spirv.Constant dense<9> : vector<4xi32>
+ %1 = spirv.UMod %0, %const2 : vector<4xi32>
+ // CHECK: %[[UMOD1:.*]] = spirv.UMod %[[UMOD0]], %[[CONST9]]
+ // CHECK: return %[[UMOD0]], %[[UMOD1]]
+ return %0, %1: vector<4xi32>, vector<4xi32>
+}
+
+// CHECK-LABEL: @umod_vector_fail_2_fold
+// CHECK-SAME: (%[[ARG:.*]]: vector<4xi32>)
+func.func @umod_vector_fail_2_fold(%arg0: vector<4xi32>) -> (vector<4xi32>, vector<4xi32>) {
+ // CHECK: %[[CONST32:.*]] = spirv.Constant dense<32> : vector<4xi32>
+ // CHECK: %[[CONST4:.*]] = spirv.Constant dense<4> : vector<4xi32>
+ %const1 = spirv.Constant dense<4> : vector<4xi32>
+ %0 = spirv.UMod %arg0, %const1 : vector<4xi32>
+ // CHECK: %[[UMOD0:.*]] = spirv.UMod %[[ARG]], %[[CONST4]]
+ %const2 = spirv.Constant dense<32> : vector<4xi32>
+ %1 = spirv.UMod %0, %const2 : vector<4xi32>
+ // CHECK: %[[UMOD1:.*]] = spirv.UMod %[[UMOD0]], %[[CONST32]]
+ // CHECK: return %[[UMOD0]], %[[UMOD1]]
+ return %0, %1: vector<4xi32>, vector<4xi32>
+}
+
// -----
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 38771f2..6c7ac7a 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -585,6 +585,42 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
}
// -----
+// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield(
+// CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
+// CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32>
+// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
+// CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32>
+// CHECK-PROP: }
+// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) {
+// CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
+// CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32>
+// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32>
+// CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32>
+// CHECK-PROP: }
+// CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32>
+// CHECK-PROP: }
+// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
+func.func @warp_scf_for_unused_yield(%arg0: index) {
+ %c128 = arith.constant 128 : index
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
+ %ini = "some_def"() : () -> (vector<128xf32>)
+ %ini1 = "some_def"() : () -> (vector<128xf32>)
+ %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) {
+ %add = arith.addi %arg3, %c1 : index
+ %1 = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>)
+ %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
+ scf.yield %acc, %1 : vector<128xf32>, vector<128xf32>
+ }
+ gpu.yield %3#0 : vector<128xf32>
+ }
+ "some_use"(%0) : (vector<4xf32>) -> ()
+ return
+}
+
+// -----
// CHECK-PROP-LABEL: func @vector_reduction(
// CHECK-PROP-SAME: %[[laneid:.*]]: index)
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index 7f97712..28e5a1f 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -34,6 +34,12 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
%15 = spirv.GL.FMix %arg0 : f32, %arg1 : f32, %arg0 : f32 -> f32
// CHECK: {{%.*}} = spirv.GL.Fract {{%.*}} : f32
%16 = spirv.GL.Fract %arg0 : f32
+ // CHECK: {{%.*}} = spirv.GL.Asinh {{%.*}} : f32
+ %17 = spirv.GL.Asinh %arg0 : f32
+ // CHECK: {{%.*}} = spirv.GL.Acosh {{%.*}} : f32
+ %18 = spirv.GL.Acosh %arg0 : f32
+ // CHECK: {{%.*}} = spirv.GL.Atanh {{%.*}} : f32
+ %19 = spirv.GL.Atanh %arg0 : f32
spirv.Return
}
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 09eae0f..0a441c3 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -41,6 +41,8 @@ endif()
set(LLVM_COMMON_CMAKE_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
+option(OFFLOAD_INCLUDE_TESTS "Generate and build offload tests." ${LLVM_INCLUDE_TESTS})
+
# Add path for custom modules
list(INSERT CMAKE_MODULE_PATH 0
"${CMAKE_CURRENT_SOURCE_DIR}/cmake"
@@ -376,15 +378,17 @@ add_subdirectory(libomptarget)
add_subdirectory(liboffload)
# Add tests.
-add_subdirectory(test)
+if(OFFLOAD_INCLUDE_TESTS)
+ add_subdirectory(test)
-# Add unit tests if GMock/GTest is present
-if(NOT LLVM_THIRD_PARTY_DIR)
- set(LLVM_THIRD_PARTY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../third-party")
-endif()
-if(EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest AND NOT TARGET llvm_gtest)
- add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
-endif()
-if(TARGET llvm_gtest)
- add_subdirectory(unittests)
+ # Add unit tests if GMock/GTest is present
+ if(NOT LLVM_THIRD_PARTY_DIR)
+ set(LLVM_THIRD_PARTY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../third-party")
+ endif()
+ if(EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest AND NOT TARGET llvm_gtest)
+ add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
+ endif()
+ if(TARGET llvm_gtest)
+ add_subdirectory(unittests)
+ endif()
endif()
diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
index 34ca04d..613d026 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
@@ -124,9 +124,6 @@
/* Define to 1 if you have the DIA SDK installed, and to 0 if you don't. */
#define LLVM_ENABLE_DIA_SDK 0
-/* Define if plugins enabled */
-#define LLVM_ENABLE_PLUGINS
-
/* Define if building LLVM with LLVM_ENABLE_TELEMETRY */
#define LLVM_ENABLE_TELEMETRY 1