author     Vitaly Buka <vitalybuka@google.com>   2024-11-05 09:21:53 -0800
committer  Vitaly Buka <vitalybuka@google.com>   2024-11-05 09:21:53 -0800
commit     f27002c4fac12dcf793fab4f6c434e14ca31bdcc (patch)
tree       fd249e8e56313dfcbed2a72a896ab60b40cef976
parent     f36f3721c603c122bdc5092276efef00a9a77766 (diff)
parent     1e50958399e0bb2a558a5d5806a61da9b2ef9e74 (diff)
[𝘀𝗽𝗿] changes introduced through rebase (users/vitalybuka/spr/main.nfctsan-eliminate-a-few-macros)
Created using spr 1.3.4 [skip ci]
-rw-r--r-- clang-tools-extra/clang-query/Query.cpp | 38
-rw-r--r-- clang-tools-extra/clang-query/QueryParser.cpp | 5
-rw-r--r-- clang-tools-extra/clang-query/QuerySession.h | 3
-rw-r--r-- clang-tools-extra/clangd/Protocol.cpp | 10
-rw-r--r-- clang-tools-extra/clangd/TidyProvider.cpp | 8
-rw-r--r-- clang-tools-extra/clangd/TidyProvider.h | 6
-rw-r--r-- clang-tools-extra/docs/ReleaseNotes.rst | 2
-rw-r--r-- clang/Maintainers.rst | 30
-rw-r--r-- clang/docs/analyzer/checkers.rst | 4
-rw-r--r-- clang/include/clang/AST/ASTContext.h | 14
-rw-r--r-- clang/include/clang/AST/Decl.h | 2
-rw-r--r-- clang/include/clang/AST/SYCLKernelInfo.h | 41
-rw-r--r-- clang/include/clang/Basic/Attr.td | 16
-rw-r--r-- clang/include/clang/Basic/AttrDocs.td | 174
-rw-r--r-- clang/include/clang/Lex/Preprocessor.h | 13
-rw-r--r-- clang/include/clang/Sema/SemaSYCL.h | 1
-rw-r--r-- clang/lib/AST/ASTContext.cpp | 27
-rw-r--r-- clang/lib/AST/ByteCode/Compiler.cpp | 18
-rw-r--r-- clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp | 18
-rw-r--r-- clang/lib/Basic/Targets/SPIR.h | 16
-rw-r--r-- clang/lib/CodeGen/CGObjCMac.cpp | 28
-rw-r--r-- clang/lib/Driver/Driver.cpp | 21
-rw-r--r-- clang/lib/Driver/ToolChain.cpp | 6
-rw-r--r-- clang/lib/Driver/ToolChains/HIPAMD.cpp | 23
-rw-r--r-- clang/lib/Driver/ToolChains/HIPUtility.cpp | 10
-rw-r--r-- clang/lib/Lex/PPMacroExpansion.cpp | 83
-rw-r--r-- clang/lib/Sema/HLSLExternalSemaSource.cpp | 15
-rw-r--r-- clang/lib/Sema/SemaDecl.cpp | 3
-rw-r--r-- clang/lib/Sema/SemaDeclAttr.cpp | 3
-rw-r--r-- clang/lib/Sema/SemaFunctionEffects.cpp | 16
-rw-r--r-- clang/lib/Sema/SemaSYCL.cpp | 9
-rw-r--r-- clang/lib/Serialization/ASTReaderDecl.cpp | 8
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp | 3
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp | 3
-rw-r--r-- clang/test/AST/ByteCode/builtin-bit-cast.cpp | 51
-rw-r--r-- clang/test/AST/HLSL/RWBuffer-AST.hlsl | 2
-rw-r--r-- clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl | 2
-rw-r--r-- clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 2
-rw-r--r-- clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp | 144
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/mock-types.h | 18
-rw-r--r-- clang/test/CodeGen/target-data.c | 2
-rw-r--r-- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 2
-rw-r--r-- clang/test/Driver/hip-toolchain-no-rdc.hip | 18
-rw-r--r-- clang/test/Misc/pragma-attribute-supported-attributes-list.test | 1
-rw-r--r-- clang/test/Sema/attr-nonblocking-constraints.cpp | 45
-rw-r--r-- clang/test/SemaSYCL/sycl-kernel-entry-point-attr-grammar.cpp | 137
-rw-r--r-- clang/test/SemaSYCL/sycl-kernel-entry-point-attr-ignored.cpp | 17
-rw-r--r-- clang/utils/TableGen/ClangAttrEmitter.cpp | 6
-rw-r--r-- compiler-rt/cmake/config-ix.cmake | 8
-rw-r--r-- compiler-rt/lib/interception/interception_win.cpp | 152
-rw-r--r-- compiler-rt/lib/profile/InstrProfilingPlatformLinux.c | 2
-rw-r--r-- compiler-rt/lib/profile/InstrProfilingPlatformOther.c | 2
-rw-r--r-- compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp | 6
-rw-r--r-- compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 44
-rw-r--r-- compiler-rt/test/builtins/Unit/ctor_dtor.c | 2
-rw-r--r-- compiler-rt/test/builtins/Unit/dso_handle.cpp | 4
-rw-r--r-- compiler-rt/test/builtins/Unit/lit.cfg.py | 8
-rw-r--r-- compiler-rt/test/lit.common.cfg.py | 3
-rw-r--r-- compiler-rt/test/profile/Posix/gcov-destructor.c | 1
-rw-r--r-- compiler-rt/test/profile/Posix/gcov-dlopen.c | 1
-rw-r--r-- compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test | 1
-rw-r--r-- compiler-rt/test/profile/instrprof-error.c | 1
-rw-r--r-- compiler-rt/test/profile/lit.cfg.py | 1
-rw-r--r-- flang/include/flang/Parser/parse-tree.h | 3
-rw-r--r-- flang/include/flang/Runtime/CUDA/memory.h | 5
-rw-r--r-- flang/lib/Lower/DirectivesCommon.h | 55
-rw-r--r-- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 11
-rw-r--r-- flang/lib/Lower/OpenMP/Clauses.cpp | 6
-rw-r--r-- flang/lib/Optimizer/Analysis/AliasAnalysis.cpp | 68
-rw-r--r-- flang/lib/Optimizer/Passes/Pipelines.cpp | 4
-rw-r--r-- flang/lib/Optimizer/Transforms/CUFOpConversion.cpp | 70
-rw-r--r-- flang/lib/Parser/openmp-parsers.cpp | 4
-rw-r--r-- flang/lib/Semantics/check-omp-structure.cpp | 56
-rw-r--r-- flang/lib/Semantics/check-omp-structure.h | 1
-rw-r--r-- flang/runtime/CUDA/memory.cpp | 7
-rw-r--r-- flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir | 102
-rw-r--r-- flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir | 121
-rw-r--r-- flang/test/Driver/mlir-debug-pass-pipeline.f90 | 4
-rw-r--r-- flang/test/Driver/mlir-pass-pipeline.f90 | 2
-rw-r--r-- flang/test/Fir/CUDA/cuda-data-transfer.fir | 70
-rw-r--r-- flang/test/Fir/convert-to-llvm-openmp-and-fir.fir | 4
-rw-r--r-- flang/test/Integration/debug-complex-2.f90 | 12
-rw-r--r-- flang/test/Integration/debug-external-linkage-name.f90 | 10
-rw-r--r-- flang/test/Lower/OpenACC/acc-atomic-capture.f90 | 107
-rw-r--r-- flang/test/Lower/OpenACC/acc-atomic-read.f90 | 19
-rw-r--r-- flang/test/Lower/OpenACC/acc-atomic-update-array.f90 | 4
-rw-r--r-- flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 | 11
-rw-r--r-- flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 | 11
-rw-r--r-- flang/test/Lower/OpenMP/Todo/task_detach.f90 | 4
-rw-r--r-- flang/test/Lower/OpenMP/atomic-capture.f90 | 6
-rw-r--r-- flang/test/Lower/OpenMP/atomic-read.f90 | 14
-rw-r--r-- flang/test/Parser/OpenMP/task.f90 | 4
-rw-r--r-- flang/test/Semantics/OpenMP/depend06.f90 | 17
-rw-r--r-- flang/test/Semantics/OpenMP/depobj-construct-v52.f90 | 2
-rw-r--r-- libc/newhdrgen/yaml_to_classes.py | 2
-rw-r--r-- libcxx/docs/FeatureTestMacroTable.rst | 2
-rw-r--r-- libcxx/docs/Status/Cxx20Papers.csv | 4
-rw-r--r-- libcxx/include/__split_buffer | 114
-rw-r--r-- libcxx/include/__vector/vector.h | 4
-rw-r--r-- libcxx/include/deque | 22
-rw-r--r-- libcxx/include/string | 9
-rw-r--r-- libcxx/include/version | 2
-rw-r--r-- libcxx/test/benchmarks/CartesianBenchmarks.h | 6
-rw-r--r-- libcxx/test/benchmarks/ContainerBenchmarks.h | 10
-rw-r--r-- libcxx/test/benchmarks/VariantBenchmarks.h | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms.partition_point.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/count.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/equal.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/fill.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/find.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/for_each.bench.cpp | 2
-rwxr-xr-x libcxx/test/benchmarks/algorithms/lexicographical_compare.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/make_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/min.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/minmax.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/mismatch.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp | 3
-rw-r--r-- libcxx/test/benchmarks/algorithms/push_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/sort.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/allocation.bench.cpp | 11
-rw-r--r-- libcxx/test/benchmarks/atomic_wait.bench.cpp | 3
-rw-r--r-- libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp | 3
-rw-r--r-- libcxx/test/benchmarks/deque.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/deque_iterator.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/exception_ptr.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/filesystem.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/format.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/format/write_double_comparison.bench.cpp | 7
-rw-r--r-- libcxx/test/benchmarks/format/write_int_comparison.bench.cpp | 7
-rw-r--r-- libcxx/test/benchmarks/format/write_string_comparison.bench.cpp | 19
-rw-r--r-- libcxx/test/benchmarks/format_to.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/format_to_n.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/formatted_size.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/formatter_float.bench.cpp | 10
-rw-r--r-- libcxx/test/benchmarks/formatter_int.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/function.bench.cpp | 6
-rw-r--r-- libcxx/test/benchmarks/hash.bench.cpp | 38
-rw-r--r-- libcxx/test/benchmarks/join_view.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/map.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/monotonic_buffer.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/numeric/gcd.bench.cpp | 5
-rw-r--r-- libcxx/test/benchmarks/ordered_set.bench.cpp | 7
-rw-r--r-- libcxx/test/benchmarks/random.bench.cpp | 3
-rw-r--r-- libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp | 7
-rw-r--r-- libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp | 7
-rw-r--r-- libcxx/test/benchmarks/stop_token.bench.cpp | 3
-rw-r--r-- libcxx/test/benchmarks/string.bench.cpp | 28
-rw-r--r-- libcxx/test/benchmarks/stringstream.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/system_error.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/to_chars.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/unordered_set_operations.bench.cpp | 94
-rw-r--r-- libcxx/test/benchmarks/variant_visit_1.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/variant_visit_2.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/variant_visit_3.bench.cpp | 2
-rw-r--r-- libcxx/test/benchmarks/vector_operations.bench.cpp | 2
-rw-r--r-- libcxx/test/std/containers/sequences/vector/vector.cons/exceptions.pass.cpp | 5
-rw-r--r-- libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp | 48
-rw-r--r-- libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp | 48
-rwxr-xr-x libcxx/utils/generate_feature_test_macro_components.py | 1
-rw-r--r-- lldb/source/Core/Debugger.cpp | 2
-rw-r--r-- lldb/source/ValueObject/ValueObject.cpp | 9
-rw-r--r-- lldb/test/API/lang/cpp/dereferencing_references/TestCPPDereferencingReferences.py | 21
-rw-r--r-- lldb/test/API/lang/cpp/dereferencing_references/main.cpp | 2
-rw-r--r-- llvm/bindings/ocaml/llvm/llvm.mli | 2
-rw-r--r-- llvm/docs/Contributing.rst | 4
-rw-r--r-- llvm/docs/LangRef.rst | 2
-rw-r--r-- llvm/docs/ReleaseNotes.md | 2
-rw-r--r-- llvm/include/llvm/IR/DerivedTypes.h | 12
-rw-r--r-- llvm/include/llvm/IR/Function.h | 11
-rw-r--r-- llvm/lib/Analysis/CFGPrinter.cpp | 14
-rw-r--r-- llvm/lib/AsmParser/LLParser.cpp | 4
-rw-r--r-- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/LiveRangeEdit.cpp | 1
-rw-r--r-- llvm/lib/CodeGen/MachineLICM.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 27
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 10
-rw-r--r-- llvm/lib/IR/Intrinsics.cpp | 92
-rw-r--r-- llvm/lib/IR/Type.cpp | 28
-rw-r--r-- llvm/lib/Linker/IRMover.cpp | 22
-rw-r--r-- llvm/lib/Support/KnownBits.cpp | 142
-rw-r--r-- llvm/lib/Target/AArch64/SMEInstrFormats.td | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 7
-rw-r--r-- llvm/lib/Target/Mips/MipsISelLowering.h | 2
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 43
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 11
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 12
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroCleanup.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 10
-rw-r--r-- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 43
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 14
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 11
-rw-r--r-- llvm/test/Analysis/ValueTracking/known-non-equal.ll | 50
-rw-r--r-- llvm/test/Analysis/ValueTracking/knownbits-sat-addsub.ll | 9
-rw-r--r-- llvm/test/Assembler/mutually-recursive-types.ll | 7
-rw-r--r-- llvm/test/Assembler/unsized-recursive-type.ll | 6
-rw-r--r-- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll | 135
-rw-r--r-- llvm/test/CodeGen/AArch64/load.ll | 14
-rw-r--r-- llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll | 495
-rw-r--r-- llvm/test/CodeGen/AArch64/srem-lkk.ll | 15
-rw-r--r-- llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir | 208
-rw-r--r-- llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir | 6
-rw-r--r-- llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll | 26
-rw-r--r-- llvm/test/CodeGen/LoongArch/fp-rounding.ll | 124
-rw-r--r-- llvm/test/CodeGen/LoongArch/merge-load-store.ll | 79
-rw-r--r-- llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-invalid.mir | 12
-rw-r--r-- llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-not-a-reg.mir | 12
-rw-r--r-- llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir | 63
-rw-r--r-- llvm/test/CodeGen/Mips/fp16-promote.ll | 80
-rw-r--r-- llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll | 22
-rw-r--r-- llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll | 38
-rw-r--r-- llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/div-by-constant.ll | 64
-rw-r--r-- llvm/test/CodeGen/RISCV/div.ll | 16
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll | 164
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll | 167
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll | 48
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll | 48
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 105
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll | 193
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll | 37
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll | 210
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 2352
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll | 285
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll | 60
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll | 66
-rw-r--r-- llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll | 28
-rw-r--r-- llvm/test/CodeGen/RISCV/varargs-with-fp-and-second-adj.ll | 46
-rw-r--r-- llvm/test/CodeGen/SPIRV/no-opbitcast-between-identical-types.ll | 17
-rw-r--r-- llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll | 71
-rw-r--r-- llvm/test/CodeGen/SystemZ/liverangeedit-kill-memop.mir | 29
-rw-r--r-- llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/combine-pmuldq.ll | 45
-rw-r--r-- llvm/test/CodeGen/X86/combine-sdiv.ll | 230
-rw-r--r-- llvm/test/CodeGen/X86/combine-udiv.ll | 48
-rw-r--r-- llvm/test/CodeGen/X86/dpbusd_const.ll | 20
-rw-r--r-- llvm/test/CodeGen/X86/pr62286.ll | 19
-rw-r--r-- llvm/test/CodeGen/X86/pr67333.ll | 54
-rw-r--r-- llvm/test/CodeGen/X86/sad.ll | 60
-rw-r--r-- llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll | 20
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshl-128.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshl-256.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshl-512.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 22
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 32
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-128.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-256.ll | 14
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-512.ll | 16
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 24
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 36
-rw-r--r-- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 2
-rw-r--r-- llvm/test/Linker/pr22807.ll | 7
-rw-r--r-- llvm/test/MC/AArch64/SME/revd-diagnostics.s | 9
-rw-r--r-- llvm/test/MC/AArch64/SME/revd.s | 17
-rw-r--r-- llvm/test/MC/X86/amx-transpose-att.s (renamed from llvm/test/MC/Disassembler/X86/amx-transpose-att.s) | 0
-rw-r--r-- llvm/test/MC/X86/amx-transpose-intel.s (renamed from llvm/test/MC/Disassembler/X86/amx-transpose-intel.s) | 0
-rw-r--r-- llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll | 4
-rw-r--r-- llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll | 4
-rw-r--r-- llvm/test/Transforms/CorrelatedValuePropagation/urem.ll | 9
-rw-r--r-- llvm/test/Transforms/InstCombine/trunc-shl-zext.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/branch-weights.ll | 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/interleave-greater-than-slice.ll | 74
-rw-r--r-- llvm/test/Transforms/VectorCombine/X86/pr114901.ll | 31
-rw-r--r-- llvm/test/Verifier/recursive-struct-param.ll | 15
-rw-r--r-- llvm/test/Verifier/recursive-type-1.ll | 12
-rw-r--r-- llvm/test/Verifier/recursive-type-2.ll | 14
-rw-r--r-- llvm/test/Verifier/recursive-type-load.ll | 12
-rw-r--r-- llvm/test/Verifier/recursive-type-store.ll | 12
-rw-r--r-- llvm/tools/bugpoint/CrashDebugger.cpp | 2
-rw-r--r-- llvm/unittests/ADT/FunctionRefTest.cpp | 4
-rw-r--r-- llvm/unittests/Analysis/TargetLibraryInfoTest.cpp | 2
-rw-r--r-- llvm/unittests/Support/KnownBitsTest.cpp | 12
-rw-r--r-- mlir/include/mlir/Dialect/Affine/IR/AffineOps.td | 69
-rw-r--r-- mlir/include/mlir/Dialect/Affine/Utils.h | 5
-rw-r--r-- mlir/include/mlir/Dialect/Arith/Transforms/Passes.h | 7
-rw-r--r-- mlir/include/mlir/Dialect/Arith/Transforms/Passes.td | 36
-rw-r--r-- mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h | 2
-rw-r--r-- mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td | 201
-rw-r--r-- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 5
-rw-r--r-- mlir/lib/Bindings/Python/IRAttributes.cpp | 278
-rw-r--r-- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 169
-rw-r--r-- mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp | 21
-rw-r--r-- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 14
-rw-r--r-- mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt | 1
-rw-r--r-- mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp | 790
-rw-r--r-- mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp | 302
-rw-r--r-- mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp | 65
-rw-r--r-- mlir/lib/Dialect/SCF/IR/SCF.cpp | 11
-rw-r--r-- mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 5
-rw-r--r-- mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp | 151
-rw-r--r-- mlir/lib/Interfaces/InferIntRangeInterface.cpp | 2
-rw-r--r-- mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp | 21
-rw-r--r-- mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 11
-rw-r--r-- mlir/test/Conversion/AffineToStandard/lower-affine.mlir | 17
-rw-r--r-- mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir | 2
-rw-r--r-- mlir/test/Dialect/Affine/affine-expand-index-ops.mlir | 26
-rw-r--r-- mlir/test/Dialect/Affine/canonicalize.mlir | 36
-rw-r--r-- mlir/test/Dialect/Affine/invalid.mlir | 16
-rw-r--r-- mlir/test/Dialect/Affine/ops.mlir | 16
-rw-r--r-- mlir/test/Dialect/Arith/int-narrowing-invalid-options.mlir | 16
-rw-r--r-- mlir/test/Dialect/Arith/int-narrowing.mlir | 997
-rw-r--r-- mlir/test/Dialect/Arith/int-range-narrowing.mlir | 265
-rw-r--r-- mlir/test/Dialect/Linalg/int-narrowing.mlir | 147
-rw-r--r-- mlir/test/Dialect/Math/polynomial-approximation.mlir | 41
-rw-r--r-- mlir/test/Dialect/OpenACC/invalid.mlir | 14
-rw-r--r-- mlir/test/Dialect/OpenACC/ops.mlir | 16
-rw-r--r-- mlir/test/Dialect/OpenMP/invalid.mlir | 40
-rw-r--r-- mlir/test/Dialect/OpenMP/ops.mlir | 104
-rw-r--r-- mlir/test/Dialect/Tosa/canonicalize.mlir | 11
-rw-r--r-- mlir/test/Dialect/Vector/int-range-interface.mlir | 11
-rw-r--r-- mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir | 178
-rw-r--r-- mlir/test/Target/LLVMIR/openmp-llvm-invalid.mlir | 4
-rw-r--r-- mlir/test/Target/LLVMIR/openmp-llvm.mlir | 98
-rw-r--r-- mlir/test/Target/LLVMIR/openmp-todo.mlir | 92
-rw-r--r-- mlir/test/python/ir/array_attributes.py | 72
-rw-r--r-- mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp | 172
-rw-r--r-- mlir/unittests/IR/AffineMapTest.cpp | 55
-rw-r--r-- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1
-rw-r--r-- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 14
-rw-r--r-- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1
353 files changed, 8591 insertions, 5888 deletions
diff --git a/clang-tools-extra/clang-query/Query.cpp b/clang-tools-extra/clang-query/Query.cpp
index 282d136..382aa5d 100644
--- a/clang-tools-extra/clang-query/Query.cpp
+++ b/clang-tools-extra/clang-query/Query.cpp
@@ -44,7 +44,9 @@ bool HelpQuery::run(llvm::raw_ostream &OS, QuerySession &QS) const {
" set bind-root (true|false) "
"Set whether to bind the root matcher to \"root\".\n"
" set print-matcher (true|false) "
- "Set whether to print the current matcher,\n"
+ "Set whether to print the current matcher.\n"
+ " set enable-profile (true|false) "
+ "Set whether to enable matcher profiling.\n"
" set traversal <kind> "
"Set traversal kind of clang-query session. Available kinds are:\n"
" AsIs "
@@ -82,10 +84,24 @@ namespace {
struct CollectBoundNodes : MatchFinder::MatchCallback {
std::vector<BoundNodes> &Bindings;
- CollectBoundNodes(std::vector<BoundNodes> &Bindings) : Bindings(Bindings) {}
+ StringRef Unit;
+ CollectBoundNodes(std::vector<BoundNodes> &Bindings, StringRef Unit)
+ : Bindings(Bindings), Unit(Unit) {}
void run(const MatchFinder::MatchResult &Result) override {
Bindings.push_back(Result.Nodes);
}
+ StringRef getID() const override { return Unit; }
+};
+
+struct QueryProfiler {
+ llvm::StringMap<llvm::TimeRecord> Records;
+
+ ~QueryProfiler() {
+ llvm::TimerGroup TG("clang-query", "clang-query matcher profiling",
+ Records);
+ TG.print(llvm::errs());
+ llvm::errs().flush();
+ }
};
} // namespace
@@ -93,8 +109,19 @@ struct CollectBoundNodes : MatchFinder::MatchCallback {
bool MatchQuery::run(llvm::raw_ostream &OS, QuerySession &QS) const {
unsigned MatchCount = 0;
+ std::optional<QueryProfiler> Profiler;
+ if (QS.EnableProfile)
+ Profiler.emplace();
+
for (auto &AST : QS.ASTs) {
- MatchFinder Finder;
+ ast_matchers::MatchFinder::MatchFinderOptions FinderOptions;
+ std::optional<llvm::StringMap<llvm::TimeRecord>> Records;
+ if (QS.EnableProfile) {
+ Records.emplace();
+ FinderOptions.CheckProfiling.emplace(*Records);
+ }
+
+ MatchFinder Finder(FinderOptions);
std::vector<BoundNodes> Matches;
DynTypedMatcher MaybeBoundMatcher = Matcher;
if (QS.BindRoot) {
@@ -102,7 +129,8 @@ bool MatchQuery::run(llvm::raw_ostream &OS, QuerySession &QS) const {
if (M)
MaybeBoundMatcher = *M;
}
- CollectBoundNodes Collect(Matches);
+ StringRef OrigSrcName = AST->getOriginalSourceFileName();
+ CollectBoundNodes Collect(Matches, OrigSrcName);
if (!Finder.addDynamicMatcher(MaybeBoundMatcher, &Collect)) {
OS << "Not a valid top-level matcher.\n";
return false;
@@ -111,6 +139,8 @@ bool MatchQuery::run(llvm::raw_ostream &OS, QuerySession &QS) const {
ASTContext &Ctx = AST->getASTContext();
Ctx.getParentMapContext().setTraversalKind(QS.TK);
Finder.matchAST(Ctx);
+ if (QS.EnableProfile)
+ Profiler->Records[OrigSrcName] += (*Records)[OrigSrcName];
if (QS.PrintMatcher) {
SmallVector<StringRef, 4> Lines;
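Taken together, the Query.cpp changes above add an opt-in profiling mode: when QS.EnableProfile is set, per-translation-unit matcher timings are collected through MatchFinderOptions::CheckProfiling and printed to stderr by the QueryProfiler destructor once the query finishes. A hypothetical interactive session exercising the new switch (commands only; the timing output depends on the matchers and inputs):

  clang-query> set enable-profile true
  clang-query> match functionDecl(isDefinition())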
diff --git a/clang-tools-extra/clang-query/QueryParser.cpp b/clang-tools-extra/clang-query/QueryParser.cpp
index 97cb264..1d5ec28 100644
--- a/clang-tools-extra/clang-query/QueryParser.cpp
+++ b/clang-tools-extra/clang-query/QueryParser.cpp
@@ -182,6 +182,7 @@ enum ParsedQueryVariable {
PQV_Output,
PQV_BindRoot,
PQV_PrintMatcher,
+ PQV_EnableProfile,
PQV_Traversal
};
@@ -285,6 +286,7 @@ QueryRef QueryParser::doParse() {
.Case("output", PQV_Output)
.Case("bind-root", PQV_BindRoot)
.Case("print-matcher", PQV_PrintMatcher)
+ .Case("enable-profile", PQV_EnableProfile)
.Case("traversal", PQV_Traversal)
.Default(PQV_Invalid);
if (VarStr.empty())
@@ -303,6 +305,9 @@ QueryRef QueryParser::doParse() {
case PQV_PrintMatcher:
Q = parseSetBool(&QuerySession::PrintMatcher);
break;
+ case PQV_EnableProfile:
+ Q = parseSetBool(&QuerySession::EnableProfile);
+ break;
case PQV_Traversal:
Q = parseSetTraversalKind(&QuerySession::TK);
break;
diff --git a/clang-tools-extra/clang-query/QuerySession.h b/clang-tools-extra/clang-query/QuerySession.h
index 31a4900..c7d5a64 100644
--- a/clang-tools-extra/clang-query/QuerySession.h
+++ b/clang-tools-extra/clang-query/QuerySession.h
@@ -26,7 +26,7 @@ public:
QuerySession(llvm::ArrayRef<std::unique_ptr<ASTUnit>> ASTs)
: ASTs(ASTs), PrintOutput(false), DiagOutput(true),
DetailedASTOutput(false), BindRoot(true), PrintMatcher(false),
- Terminate(false), TK(TK_AsIs) {}
+ EnableProfile(false), Terminate(false), TK(TK_AsIs) {}
llvm::ArrayRef<std::unique_ptr<ASTUnit>> ASTs;
@@ -36,6 +36,7 @@ public:
bool BindRoot;
bool PrintMatcher;
+ bool EnableProfile;
bool Terminate;
TraversalKind TK;
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index 295ccd2..761f968 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -504,6 +504,16 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R,
P.field("offsetEncoding")))
return false;
}
+
+ if (auto *Experimental = O->getObject("experimental")) {
+ if (auto *TextDocument = Experimental->getObject("textDocument")) {
+ if (auto *Completion = TextDocument->getObject("completion")) {
+ if (auto EditsNearCursor = Completion->getBoolean("editsNearCursor"))
+ R.CompletionFixes |= *EditsNearCursor;
+ }
+ }
+ }
+
return true;
}
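The block added above reads a clangd-specific extension from the LSP client capabilities. The JSON shape it expects (illustrative, with keys taken directly from the parsing code) is roughly:

  {
    "experimental": {
      "textDocument": {
        "completion": { "editsNearCursor": true }
      }
    }
  }

A client that advertises this sets R.CompletionFixes, the same flag previously driven only by the legacy top-level capability.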
diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp
index a87238e..2ac1232 100644
--- a/clang-tools-extra/clangd/TidyProvider.cpp
+++ b/clang-tools-extra/clangd/TidyProvider.cpp
@@ -46,7 +46,7 @@ public:
[this](std::optional<llvm::StringRef> Data) {
Value.reset();
if (Data && !Data->empty()) {
- tidy::DiagCallback Diagnostics = [](const llvm::SMDiagnostic &D) {
+ auto Diagnostics = [](const llvm::SMDiagnostic &D) {
switch (D.getKind()) {
case llvm::SourceMgr::DK_Error:
elog("tidy-config error at {0}:{1}:{2}: {3}", D.getFilename(),
@@ -149,7 +149,7 @@ static void mergeCheckList(std::optional<std::string> &Checks,
*Checks = llvm::join_items(",", *Checks, List);
}
-TidyProviderRef provideEnvironment() {
+TidyProvider provideEnvironment() {
static const std::optional<std::string> User = [] {
std::optional<std::string> Ret = llvm::sys::Process::GetEnv("USER");
#ifdef _WIN32
@@ -167,7 +167,7 @@ TidyProviderRef provideEnvironment() {
return [](tidy::ClangTidyOptions &, llvm::StringRef) {};
}
-TidyProviderRef provideDefaultChecks() {
+TidyProvider provideDefaultChecks() {
// These default checks are chosen for:
// - low false-positive rate
// - providing a lot of value
@@ -251,7 +251,7 @@ TidyProvider disableUnusableChecks(llvm::ArrayRef<std::string> ExtraBadChecks) {
};
}
-TidyProviderRef provideClangdConfig() {
+TidyProvider provideClangdConfig() {
return [](tidy::ClangTidyOptions &Opts, llvm::StringRef) {
const auto &CurTidyConfig = Config::current().Diagnostics.ClangTidy;
if (!CurTidyConfig.Checks.empty())
diff --git a/clang-tools-extra/clangd/TidyProvider.h b/clang-tools-extra/clangd/TidyProvider.h
index 7d849d34..8424f5e 100644
--- a/clang-tools-extra/clangd/TidyProvider.h
+++ b/clang-tools-extra/clangd/TidyProvider.h
@@ -30,11 +30,11 @@ using TidyProviderRef = llvm::function_ref<void(tidy::ClangTidyOptions &,
TidyProvider combine(std::vector<TidyProvider> Providers);
/// Provider that just sets the defaults.
-TidyProviderRef provideEnvironment();
+TidyProvider provideEnvironment();
/// Provider that will enable a nice set of default checks if none are
/// specified.
-TidyProviderRef provideDefaultChecks();
+TidyProvider provideDefaultChecks();
/// Provider the enables a specific set of checks and warnings as errors.
TidyProvider addTidyChecks(llvm::StringRef Checks,
@@ -51,7 +51,7 @@ disableUnusableChecks(llvm::ArrayRef<std::string> ExtraBadChecks = {});
TidyProvider provideClangTidyFiles(ThreadsafeFS &);
// Provider that uses clangd configuration files.
-TidyProviderRef provideClangdConfig();
+TidyProvider provideClangdConfig();
tidy::ClangTidyOptions getTidyOptionsForFile(TidyProviderRef Provider,
llvm::StringRef Filename);
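The signature changes in this header (and the matching ones in TidyProvider.cpp above) return the owning TidyProvider type instead of TidyProviderRef, an llvm::function_ref alias. The distinction matters because function_ref does not own its callee; a minimal sketch of the dangling pattern this avoids (hypothetical helper names):

  // Unsafe: the lambda is a temporary and function_ref only borrows it,
  // so the returned reference dangles as soon as the function returns.
  llvm::function_ref<void(tidy::ClangTidyOptions &, llvm::StringRef)> makeRef() {
    return [](tidy::ClangTidyOptions &, llvm::StringRef) {};
  }

  // Safe: TidyProvider (the owning alias declared earlier in this header)
  // keeps the callable alive for as long as the caller holds it.
  TidyProvider makeOwned() {
    return [](tidy::ClangTidyOptions &, llvm::StringRef) {};
  }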
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 51ba157..abcdcc2 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -98,7 +98,7 @@ Improvements to clang-doc
Improvements to clang-query
---------------------------
-The improvements are...
+- Added `set enable-profile true/false` command for basic matcher profiling.
Improvements to clang-tidy
--------------------------
diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst
index 694ebc6..26495f3 100644
--- a/clang/Maintainers.rst
+++ b/clang/Maintainers.rst
@@ -68,6 +68,15 @@ Sema
| Sirraide
| aeternalmail\@gmail.com (email), Sirraide (GitHub), Ætérnal (Discord), Sirraide (Discourse)
+| Mariya Podchishchaeva
+| mariya.podchishchaeva\@intel.com (email), Fznamznon (GitHub), fznamznon (Discord), Fznamznon (Discourse)
+
+
+Recovery AST
+~~~~~~~~~~~~
+| Haojian Wu
+| hokein.wu\@gmail.com (email), hokein (Phabricator), hokein (GitHub), hokein (Discourse)
+
Experimental new constant interpreter
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -132,6 +141,15 @@ Compiler options
| jan_svoboda\@apple.com (email), jansvoboda11 (Phabricator), jansvoboda11 (GitHub)
+API Notes
+~~~~~~~~~~~~~~~~
+| Egor Zhdan
+| e_zhdan\@apple.com (email), egorzhdan (GitHub), egor.zhdan (Discourse)
+
+| Saleem Abdulrasool
+| compnerd\@compnerd.org (email), compnerd (GitHub), compnerd (Discourse)
+
+
OpenBSD driver
~~~~~~~~~~~~~~
| Brad Smith
@@ -144,6 +162,12 @@ Driver parts not covered by someone else
| i\@maskray.me (email), MaskRay (Phabricator), MaskRay (GitHub)
+Constant Expressions
+~~~~~~~~~~~~~~~~~~~~
+| Mariya Podchishchaeva
+| mariya.podchishchaeva\@intel.com (email), Fznamznon (GitHub), fznamznon (Discord), Fznamznon (Discourse)
+
+
Tools
-----
These maintainers are responsible for user-facing tools under the Clang
@@ -295,6 +319,12 @@ SYCL conformance
| alexey.bader\@intel.com (email), bader (Phabricator), bader (GitHub)
+HLSL conformance
+~~~~~~~~~~~~~~~~
+| Chris Bieneman
+| chris.bieneman\@gmail.com (email), llvm-beanz (GitHub), beanz (Discord), beanz (Discourse)
+
+
Issue Triage
~~~~~~~~~~~~
| Shafik Yaghmour
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index da4ec71..8a9e207 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -3459,8 +3459,8 @@ Raw pointers and references to an object which supports CheckedPtr or CheckedRef
.. code-block:: cpp
struct CheckableObj {
- void incrementPtrCount() {}
- void decrementPtrCount() {}
+ void incrementCheckedPtrCount() {}
+ void decrementCheckedPtrCount() {}
};
struct Foo {
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index fb0c051..1e8101f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -23,6 +23,7 @@
#include "clang/AST/ExternalASTSource.h"
#include "clang/AST/PrettyPrinter.h"
#include "clang/AST/RawCommentList.h"
+#include "clang/AST/SYCLKernelInfo.h"
#include "clang/AST/TemplateName.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/PartialDiagnostic.h"
@@ -1239,6 +1240,11 @@ public:
/// in device compilation.
llvm::DenseSet<const FunctionDecl *> CUDAImplicitHostDeviceFunUsedByDevice;
+ /// Map of SYCL kernels indexed by the unique type used to name the kernel.
+ /// Entries are not serialized but are recreated on deserialization of a
+ /// sycl_kernel_entry_point attributed function declaration.
+ llvm::DenseMap<CanQualType, SYCLKernelInfo> SYCLKernels;
+
/// For capturing lambdas with an explicit object parameter whose type is
/// derived from the lambda type, we need to perform derived-to-base
/// conversion so we can access the captures; the cast paths for that
@@ -3340,6 +3346,14 @@ public:
void getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
GlobalDecl GD) const;
+ /// Generates and stores SYCL kernel metadata for the provided
+ /// SYCL kernel entry point function. The provided function must have
+ /// an attached sycl_kernel_entry_point attribute that specifies a unique
+ /// type for the name of a SYCL kernel. Callers are required to detect
+ /// conflicting SYCL kernel names and issue a diagnostic prior to calling
+ /// this function.
+ void registerSYCLEntryPointFunction(FunctionDecl *FD);
+
//===--------------------------------------------------------------------===//
// Statistics
//===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 7ff35d7..8c39ef3d5 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -737,7 +737,7 @@ class DeclaratorDecl : public ValueDecl {
// qualifier, to be used for the (uncommon) case of out-of-line declarations
// and constrained function decls.
struct ExtInfo : public QualifierInfo {
- TypeSourceInfo *TInfo;
+ TypeSourceInfo *TInfo = nullptr;
Expr *TrailingRequiresClause = nullptr;
};
diff --git a/clang/include/clang/AST/SYCLKernelInfo.h b/clang/include/clang/AST/SYCLKernelInfo.h
new file mode 100644
index 0000000..55dba1f8
--- /dev/null
+++ b/clang/include/clang/AST/SYCLKernelInfo.h
@@ -0,0 +1,41 @@
+//===--- SYCLKernelInfo.h --- Information about SYCL kernels --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares types used to describe SYCL kernels.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_SYCLKERNELINFO_H
+#define LLVM_CLANG_AST_SYCLKERNELINFO_H
+
+#include "clang/AST/Decl.h"
+#include "clang/AST/Type.h"
+
+namespace clang {
+
+class SYCLKernelInfo {
+public:
+ SYCLKernelInfo(CanQualType KernelNameType,
+ const FunctionDecl *KernelEntryPointDecl)
+ : KernelNameType(KernelNameType),
+ KernelEntryPointDecl(KernelEntryPointDecl) {}
+
+ CanQualType getKernelNameType() const { return KernelNameType; }
+
+ const FunctionDecl *getKernelEntryPointDecl() const {
+ return KernelEntryPointDecl;
+ }
+
+private:
+ CanQualType KernelNameType;
+ const FunctionDecl *KernelEntryPointDecl;
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_AST_SYCLKERNELINFO_H
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 156fbd1..fbad11b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -407,7 +407,8 @@ def MicrosoftExt : LangOpt<"MicrosoftExt">;
def Borland : LangOpt<"Borland">;
def CUDA : LangOpt<"CUDA">;
def HIP : LangOpt<"HIP">;
-def SYCL : LangOpt<"SYCLIsDevice">;
+def SYCLHost : LangOpt<"SYCLIsHost">;
+def SYCLDevice : LangOpt<"SYCLIsDevice">;
def COnly : LangOpt<"", "!LangOpts.CPlusPlus">;
def CPlusPlus : LangOpt<"CPlusPlus">;
def OpenCL : LangOpt<"OpenCL">;
@@ -1493,14 +1494,23 @@ def : MutualExclusions<[CUDAConstant, CUDAShared, HIPManaged]>;
def SYCLKernel : InheritableAttr {
let Spellings = [Clang<"sycl_kernel">];
let Subjects = SubjectList<[FunctionTmpl]>;
- let LangOpts = [SYCL];
+ let LangOpts = [SYCLDevice];
let Documentation = [SYCLKernelDocs];
}
+def SYCLKernelEntryPoint : InheritableAttr {
+ let Spellings = [Clang<"sycl_kernel_entry_point">];
+ let Args = [TypeArgument<"KernelName">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let TemplateDependent = 1;
+ let LangOpts = [SYCLHost, SYCLDevice];
+ let Documentation = [SYCLKernelEntryPointDocs];
+}
+
def SYCLSpecialClass: InheritableAttr {
let Spellings = [Clang<"sycl_special_class">];
let Subjects = SubjectList<[CXXRecord]>;
- let LangOpts = [SYCL];
+ let LangOpts = [SYCLDevice];
let Documentation = [SYCLSpecialClassDocs];
}
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index b497cce..ed251b0 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -455,6 +455,180 @@ The SYCL kernel in the previous code sample meets these expectations.
}];
}
+def SYCLKernelEntryPointDocs : Documentation {
+ let Category = DocCatFunction;
+ let Content = [{
+The ``sycl_kernel_entry_point`` attribute facilitates the generation of an
+offload kernel entry point, sometimes called a SYCL kernel caller function,
+suitable for invoking a SYCL kernel on an offload device. The attribute is
+intended for use in the implementation of SYCL kernel invocation functions
+like the ``single_task`` and ``parallel_for`` member functions of the
+``sycl::handler`` class specified in section 4.9.4, "Command group ``handler``
+class", of the SYCL 2020 specification.
+
+The attribute requires a single type argument that specifies a class type that
+meets the requirements for a SYCL kernel name as described in section 5.2,
+"Naming of kernels", of the SYCL 2020 specification. A unique kernel name type
+is required for each function declared with the attribute. The attribute may
+not first appear on a declaration that follows a definition of the function.
+
+The attribute only appertains to functions and only those that meet the
+following requirements.
+
+* Has a ``void`` return type.
+* Is not a non-static member function, constructor, or destructor.
+* Is not a C variadic function.
+* Is not a coroutine.
+* Is not defined as deleted or as defaulted.
+* Is not declared with the ``constexpr`` or ``consteval`` specifiers.
+* Is not declared with the ``[[noreturn]]`` attribute.
+
+Use in the implementation of a SYCL kernel invocation function might look as
+follows.
+
+.. code-block:: c++
+
+ namespace sycl {
+ class handler {
+ template<typename KernelNameType, typename KernelType>
+ [[ clang::sycl_kernel_entry_point(KernelNameType) ]]
+ static void kernel_entry_point(KernelType kernel) {
+ kernel();
+ }
+
+ public:
+ template<typename KernelNameType, typename KernelType>
+ void single_task(KernelType kernel) {
+ // Call kernel_entry_point() to trigger generation of an offload
+ // kernel entry point.
+ kernel_entry_point<KernelNameType>(kernel);
+ // Call functions appropriate for the desired offload backend
+ // (OpenCL, CUDA, HIP, Level Zero, etc...).
+ }
+ };
+ } // namespace sycl
+
+A SYCL kernel is a callable object of class type that is constructed on a host,
+often via a lambda expression, and then passed to a SYCL kernel invocation
+function to be executed on an offload device. A SYCL kernel invocation function
+is responsible for copying the provided SYCL kernel object to an offload
+device and initiating a call to it. The SYCL kernel object and its data members
+constitute the parameters of an offload kernel.
+
+A SYCL kernel type is required to satisfy the device copyability requirements
+specified in section 3.13.1, "Device copyable", of the SYCL 2020 specification.
+Additionally, any data members of the kernel object type are required to satisfy
+section 4.12.4, "Rules for parameter passing to kernels". For most types, these
+rules require that the type is trivially copyable. However, the SYCL
+specification mandates that certain special SYCL types, such as
+``sycl::accessor`` and ``sycl::stream`` be device copyable even if they are not
+trivially copyable. These types require special handling because they cannot
+be copied to device memory as if by ``memcpy()``. Additionally, some offload
+backends, OpenCL for example, require objects of some of these types to be
+passed as individual arguments to the offload kernel.
+
+An offload kernel consists of an entry point function that declares the
+parameters of the offload kernel and the set of all functions and variables that
+are directly or indirectly used by the entry point function.
+
+A SYCL kernel invocation function invokes a SYCL kernel on a device by
+performing the following tasks (likely with the help of an offload backend
+like OpenCL):
+
+#. Identifying the offload kernel entry point to be used for the SYCL kernel.
+
+#. Deconstructing the SYCL kernel object, if necessary, to produce the set of
+ offload kernel arguments required by the offload kernel entry point.
+
+#. Copying the offload kernel arguments to device memory.
+
+#. Initiating execution of the offload kernel entry point.
+
+The offload kernel entry point for a SYCL kernel performs the following tasks:
+
+#. Reconstituting the SYCL kernel object, if necessary, using the offload
+ kernel parameters.
+
+#. Calling the ``operator()`` member function of the (reconstituted) SYCL kernel
+ object.
+
+The ``sycl_kernel_entry_point`` attribute automates generation of an offload
+kernel entry point that performs those latter tasks. The parameters and body of
+a function declared with the ``sycl_kernel_entry_point`` attribute specify a
+pattern from which the parameters and body of the entry point function are
+derived. Consider the following call to a SYCL kernel invocation function.
+
+.. code-block:: c++
+
+ struct S { int i; };
+ void f(sycl::handler &handler, sycl::stream &sout, S s) {
+ handler.single_task<struct KN>([=] {
+ sout << "The value of s.i is " << s.i << "\n";
+ });
+ }
+
+The SYCL kernel object is the result of the lambda expression. It has two
+data members corresponding to the captures of ``sout`` and ``s``. Since one
+of these data members corresponds to a special SYCL type that must be passed
+individually as an offload kernel parameter, it is necessary to decompose the
+SYCL kernel object into its constituent parts; the offload kernel will have
+two kernel parameters. Given a SYCL implementation that uses a
+``sycl_kernel_entry_point`` attributed function like the one shown above, an
+offload kernel entry point function will be generated that looks approximately
+as follows.
+
+.. code-block:: c++
+
+ void sycl-kernel-caller-for-KN(sycl::stream sout, S s) {
+ kernel-type kernel = { sout, s };
+ kernel();
+ }
+
+There are a few items worthy of note:
+
+#. The name of the generated function incorporates the SYCL kernel name,
+ ``KN``, that was passed as the ``KernelNameType`` template parameter to
+ ``kernel_entry_point()`` and provided as the argument to the
+ ``sycl_kernel_entry_point`` attribute. There is a one-to-one correspondence
+ between SYCL kernel names and offload kernel entry points.
+
+#. The SYCL kernel is a lambda closure type and therefore has no name;
+ ``kernel-type`` is substituted above and corresponds to the ``KernelType``
+ template parameter deduced in the call to ``kernel_entry_point()``.
+ Lambda types cannot be declared and initialized using the aggregate
+ initialization syntax used above, but the intended behavior should be clear.
+
+#. ``S`` is a device copyable type that does not directly or indirectly contain
+ a data member of a SYCL special type. It therefore does not need to be
+ decomposed into its constituent members to be passed as a kernel argument.
+
+#. The depiction of the ``sycl::stream`` parameter as a single self-contained
+ kernel parameter is an oversimplification. SYCL special types may require
+ additional decomposition such that the generated function might have three
+ or more parameters depending on how the SYCL library implementation defines
+ these types.
+
+#. The call to ``kernel_entry_point()`` has no effect other than to trigger
+ emission of the entry point function. The statements that make up the body
+ of the function are not executed when the function is called; they are
+ only used in the generation of the entry point function.
+
+It is not necessary for a function declared with the ``sycl_kernel_entry_point``
+attribute to be called for the offload kernel entry point to be emitted. For
+inline functions and function templates, any ODR-use will suffice. For other
+functions, an ODR-use is not required; the offload kernel entry point will be
+emitted if the function is defined.
+
+Functions declared with the ``sycl_kernel_entry_point`` attribute are not
+limited to the simple example shown above. They may have additional template
+parameters, declare additional function parameters, and have complex control
+flow in the function body. Function parameter decomposition and reconstitution
+is performed for all function parameters. The function must abide by the
+language feature restrictions described in section 5.4, "Language restrictions
+for device functions" in the SYCL 2020 specification.
+ }];
+}
+
def SYCLSpecialClassDocs : Documentation {
let Category = DocCatStmt;
let Content = [{
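To make the requirements listed in the new SYCLKernelEntryPointDocs text concrete, a minimal non-template use of the attribute might look as follows (hypothetical names; per the documentation above, the entry point is emitted simply because the function is defined, so no ODR-use is needed):

  struct MyKernelName;                              // unique kernel name type
  struct MyKernel { void operator()() const {} };   // device-copyable kernel object

  [[clang::sycl_kernel_entry_point(MyKernelName)]]
  void my_entry_point(MyKernel kernel) {            // void return, non-member, not variadic
    kernel();
  }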
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 38a527d..3312d4e 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2617,6 +2617,19 @@ private:
/// \#pragma GCC poison/system_header/dependency and \#pragma once.
void RegisterBuiltinPragmas();
+ /// RegisterBuiltinMacro - Register the specified identifier in the identifier
+ /// table and mark it as a builtin macro to be expanded.
+ IdentifierInfo *RegisterBuiltinMacro(const char *Name) {
+ // Get the identifier.
+ IdentifierInfo *Id = getIdentifierInfo(Name);
+
+ // Mark it as being a macro that is builtin.
+ MacroInfo *MI = AllocateMacroInfo(SourceLocation());
+ MI->setIsBuiltinMacro();
+ appendDefMacroDirective(Id, MI);
+ return Id;
+ }
+
/// Register builtin macros such as __LINE__ with the identifier table.
void RegisterBuiltinMacros();
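With RegisterBuiltinMacro now a Preprocessor member, the call sites in RegisterBuiltinMacros (in PPMacroExpansion.cpp, also touched by this patch) would take roughly this form (a sketch, not the verbatim change):

  Ident__LINE__ = RegisterBuiltinMacro("__LINE__");
  Ident__FILE__ = RegisterBuiltinMacro("__FILE__");
  Ident__DATE__ = RegisterBuiltinMacro("__DATE__");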
diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h
index 27c42b5..c9f3358 100644
--- a/clang/include/clang/Sema/SemaSYCL.h
+++ b/clang/include/clang/Sema/SemaSYCL.h
@@ -62,6 +62,7 @@ public:
ParsedType ParsedTy);
void handleKernelAttr(Decl *D, const ParsedAttr &AL);
+ void handleKernelEntryPointAttr(Decl *D, const ParsedAttr &AL);
};
} // namespace clang
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 11e79d2..91a7d4b 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -14411,6 +14411,33 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
}
}
+static SYCLKernelInfo BuildSYCLKernelInfo(CanQualType KernelNameType,
+ const FunctionDecl *FD) {
+ return {KernelNameType, FD};
+}
+
+void ASTContext::registerSYCLEntryPointFunction(FunctionDecl *FD) {
+ // If the function declaration to register is invalid or dependent, the
+ // registration attempt is ignored.
+ if (FD->isInvalidDecl() || FD->isTemplated())
+ return;
+
+ const auto *SKEPAttr = FD->getAttr<SYCLKernelEntryPointAttr>();
+ assert(SKEPAttr && "Missing sycl_kernel_entry_point attribute");
+
+ // Be tolerant of multiple registration attempts so long as each attempt
+ // is for the same entity. Callers are obligated to detect and diagnose
+ // conflicting kernel names prior to calling this function.
+ CanQualType KernelNameType = getCanonicalType(SKEPAttr->getKernelName());
+ auto IT = SYCLKernels.find(KernelNameType);
+ assert((IT == SYCLKernels.end() ||
+ declaresSameEntity(FD, IT->second.getKernelEntryPointDecl())) &&
+ "SYCL kernel name conflict");
+ (void)IT;
+ SYCLKernels.insert(
+ std::make_pair(KernelNameType, BuildSYCLKernelInfo(KernelNameType, FD)));
+}
+
OMPTraitInfo &ASTContext::getNewOMPTraitInfo() {
OMPTraitInfoVector.emplace_back(new OMPTraitInfo());
return *OMPTraitInfoVector.back();
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 396213c..7cf2519 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -6446,8 +6446,6 @@ bool Compiler<Emitter>::emitBuiltinBitCast(const CastExpr *E) {
QualType ToType = E->getType();
std::optional<PrimType> ToT = classify(ToType);
- assert(!DiscardResult && "Implement DiscardResult mode for bitcasts.");
-
if (ToType->isNullPtrType()) {
if (!this->discard(SubExpr))
return false;
@@ -6463,12 +6461,24 @@ bool Compiler<Emitter>::emitBuiltinBitCast(const CastExpr *E) {
}
assert(!ToType->isReferenceType());
+ // Prepare storage for the result in case we discard.
+ if (DiscardResult && !Initializing && !ToT) {
+ std::optional<unsigned> LocalIndex = allocateLocal(E);
+ if (!LocalIndex)
+ return false;
+ if (!this->emitGetPtrLocal(*LocalIndex, E))
+ return false;
+ }
+
// Get a pointer to the value-to-cast on the stack.
if (!this->visit(SubExpr))
return false;
- if (!ToT || ToT == PT_Ptr)
- return this->emitBitCastPtr(E);
+ if (!ToT || ToT == PT_Ptr) {
+ if (!this->emitBitCastPtr(E))
+ return false;
+ return DiscardResult ? this->emitPopPtr(E) : true;
+ }
assert(ToT);
const llvm::fltSemantics *TargetSemantics = nullptr;
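The new DiscardResult handling corresponds to source where a __builtin_bit_cast result of composite type is evaluated and then thrown away, e.g. (an illustrative constexpr snippet, not taken from the updated tests):

  struct S { int a; int b; };
  constexpr bool discards_cast() {
    S s{1, 2};
    __builtin_bit_cast(S, s);   // composite result, immediately discarded
    return true;
  }
  static_assert(discards_cast());

Previously the bytecode compiler asserted on this pattern; it now allocates a temporary local to receive the value and pops the pointer when the result is discarded.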
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index e1de151..17a175a 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -26,8 +26,8 @@ using namespace clang;
using namespace clang::interp;
/// Used to iterate over pointer fields.
-using DataFunc =
- llvm::function_ref<bool(const Pointer &P, PrimType Ty, size_t BitOffset)>;
+using DataFunc = llvm::function_ref<bool(const Pointer &P, PrimType Ty,
+ size_t BitOffset, bool PackedBools)>;
#define BITCAST_TYPE_SWITCH(Expr, B) \
do { \
@@ -89,6 +89,7 @@ struct BitcastBuffer {
std::byte *getBytes(unsigned BitOffset) const {
assert(BitOffset % 8 == 0);
+ assert(BitOffset < SizeInBits);
return const_cast<std::byte *>(data() + (BitOffset / 8));
}
@@ -147,7 +148,7 @@ static bool enumerateData(const Pointer &P, const Context &Ctx, size_t Offset,
// Primitives.
if (FieldDesc->isPrimitive())
- return F(P, FieldDesc->getPrimType(), Offset);
+ return F(P, FieldDesc->getPrimType(), Offset, false);
// Primitive arrays.
if (FieldDesc->isPrimitiveArray()) {
@@ -155,10 +156,12 @@ static bool enumerateData(const Pointer &P, const Context &Ctx, size_t Offset,
QualType ElemType = FieldDesc->getElemQualType();
size_t ElemSizeInBits = Ctx.getASTContext().getTypeSize(ElemType);
PrimType ElemT = *Ctx.classify(ElemType);
+ // Special case, since the bools here are packed.
+ bool PackedBools = FieldDesc->getType()->isExtVectorBoolType();
bool Ok = true;
for (unsigned I = 0; I != FieldDesc->getNumElems(); ++I) {
unsigned Index = BigEndianTarget ? (FieldDesc->getNumElems() - 1 - I) : I;
- Ok = Ok && F(P.atIndex(Index), ElemT, Offset);
+ Ok = Ok && F(P.atIndex(Index), ElemT, Offset, PackedBools);
Offset += ElemSizeInBits;
}
return Ok;
@@ -302,7 +305,8 @@ static bool readPointerToBuffer(const Context &Ctx, const Pointer &FromPtr,
return enumeratePointerFields(
FromPtr, Ctx,
- [&](const Pointer &P, PrimType T, size_t BitOffset) -> bool {
+ [&](const Pointer &P, PrimType T, size_t BitOffset,
+ bool PackedBools) -> bool {
if (!P.isInitialized()) {
assert(false && "Implement uninitialized value tracking");
return ReturnOnUninit;
@@ -334,6 +338,8 @@ static bool readPointerToBuffer(const Context &Ctx, const Pointer &FromPtr,
} else {
if (const FieldDecl *FD = P.getField(); FD && FD->isBitField())
BitWidth = FD->getBitWidthValue(ASTCtx);
+ else if (T == PT_Bool && PackedBools)
+ BitWidth = 1;
BITCAST_TYPE_SWITCH(T, {
T Val = P.deref<T>();
@@ -401,7 +407,7 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
size_t BitOffset = 0;
bool Success = enumeratePointerFields(
ToPtr, S.getContext(),
- [&](const Pointer &P, PrimType T, size_t _) -> bool {
+ [&](const Pointer &P, PrimType T, size_t _, bool PackedBools) -> bool {
if (T == PT_Float) {
CharUnits ObjectReprChars = ASTCtx.getTypeSizeInChars(P.getType());
const auto &Semantics = ASTCtx.getFloatTypeSemantics(P.getType());
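The PackedBools flag threaded through the callbacks above exists because boolean ext-vectors store one bit per element rather than one byte. A source pattern that reaches this path might look like the following (a hedged example; the element count is chosen so the vector occupies a whole byte):

  using bool8 = bool __attribute__((ext_vector_type(8)));
  constexpr unsigned char to_bits(bool8 v) {
    return __builtin_bit_cast(unsigned char, v);   // each element contributes one bit
  }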
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 30c7ac8..85e4bd9 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -310,8 +310,8 @@ public:
// SPIR-V IDs are represented with a single 32-bit word.
SizeType = TargetInfo::UnsignedInt;
- resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1");
+ resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-"
+ "v256:256-v512:512-v1024:1024-n8:16:32:64-G1");
}
void getTargetDefines(const LangOptions &Opts,
@@ -334,8 +334,8 @@ public:
// SPIR-V has core support for atomic ops, and Int32 is always available;
// we take the maximum because it's possible the Host supports wider types.
MaxAtomicInlineWidth = std::max<unsigned char>(MaxAtomicInlineWidth, 32);
- resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1");
+ resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-"
+ "v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1");
}
void getTargetDefines(const LangOptions &Opts,
@@ -358,8 +358,8 @@ public:
// SPIR-V has core support for atomic ops, and Int64 is always available;
// we take the maximum because it's possible the Host supports wider types.
MaxAtomicInlineWidth = std::max<unsigned char>(MaxAtomicInlineWidth, 64);
- resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1");
+ resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-"
+ "v256:256-v512:512-v1024:1024-n8:16:32:64-G1");
}
void getTargetDefines(const LangOptions &Opts,
@@ -384,8 +384,8 @@ public:
PtrDiffType = IntPtrType = TargetInfo::SignedLong;
AddrSpaceMap = &SPIRDefIsGenMap;
- resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0");
+ resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-"
+ "v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0");
BFloat16Width = BFloat16Align = 16;
BFloat16Format = &llvm::APFloat::BFloat();
diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp
index 47ea636..7caf801 100644
--- a/clang/lib/CodeGen/CGObjCMac.cpp
+++ b/clang/lib/CodeGen/CGObjCMac.cpp
@@ -365,7 +365,7 @@ public:
/// GcReadWeakFn -- LLVM objc_read_weak (id *src) function.
llvm::FunctionCallee getGcReadWeakFn() {
// id objc_read_weak (id *)
- llvm::Type *args[] = { ObjectPtrTy->getPointerTo() };
+ llvm::Type *args[] = {CGM.UnqualPtrTy};
llvm::FunctionType *FTy =
llvm::FunctionType::get(ObjectPtrTy, args, false);
return CGM.CreateRuntimeFunction(FTy, "objc_read_weak");
@@ -374,7 +374,7 @@ public:
/// GcAssignWeakFn -- LLVM objc_assign_weak function.
llvm::FunctionCallee getGcAssignWeakFn() {
// id objc_assign_weak (id, id *)
- llvm::Type *args[] = { ObjectPtrTy, ObjectPtrTy->getPointerTo() };
+ llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
llvm::FunctionType *FTy =
llvm::FunctionType::get(ObjectPtrTy, args, false);
return CGM.CreateRuntimeFunction(FTy, "objc_assign_weak");
@@ -383,7 +383,7 @@ public:
/// GcAssignGlobalFn -- LLVM objc_assign_global function.
llvm::FunctionCallee getGcAssignGlobalFn() {
// id objc_assign_global(id, id *)
- llvm::Type *args[] = { ObjectPtrTy, ObjectPtrTy->getPointerTo() };
+ llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
llvm::FunctionType *FTy =
llvm::FunctionType::get(ObjectPtrTy, args, false);
return CGM.CreateRuntimeFunction(FTy, "objc_assign_global");
@@ -392,7 +392,7 @@ public:
/// GcAssignThreadLocalFn -- LLVM objc_assign_threadlocal function.
llvm::FunctionCallee getGcAssignThreadLocalFn() {
// id objc_assign_threadlocal(id src, id * dest)
- llvm::Type *args[] = { ObjectPtrTy, ObjectPtrTy->getPointerTo() };
+ llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
llvm::FunctionType *FTy =
llvm::FunctionType::get(ObjectPtrTy, args, false);
return CGM.CreateRuntimeFunction(FTy, "objc_assign_threadlocal");
@@ -401,8 +401,7 @@ public:
/// GcAssignIvarFn -- LLVM objc_assign_ivar function.
llvm::FunctionCallee getGcAssignIvarFn() {
// id objc_assign_ivar(id, id *, ptrdiff_t)
- llvm::Type *args[] = { ObjectPtrTy, ObjectPtrTy->getPointerTo(),
- CGM.PtrDiffTy };
+ llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy, CGM.PtrDiffTy};
llvm::FunctionType *FTy =
llvm::FunctionType::get(ObjectPtrTy, args, false);
return CGM.CreateRuntimeFunction(FTy, "objc_assign_ivar");
@@ -419,7 +418,7 @@ public:
/// GcAssignStrongCastFn -- LLVM objc_assign_strongCast function.
llvm::FunctionCallee getGcAssignStrongCastFn() {
// id objc_assign_strongCast(id, id *)
- llvm::Type *args[] = { ObjectPtrTy, ObjectPtrTy->getPointerTo() };
+ llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
llvm::FunctionType *FTy =
llvm::FunctionType::get(ObjectPtrTy, args, false);
return CGM.CreateRuntimeFunction(FTy, "objc_assign_strongCast");
@@ -554,7 +553,7 @@ public:
/// ExceptionTryEnterFn - LLVM objc_exception_try_enter function.
llvm::FunctionCallee getExceptionTryEnterFn() {
- llvm::Type *params[] = { ExceptionDataTy->getPointerTo() };
+ llvm::Type *params[] = {CGM.UnqualPtrTy};
return CGM.CreateRuntimeFunction(
llvm::FunctionType::get(CGM.VoidTy, params, false),
"objc_exception_try_enter");
@@ -562,7 +561,7 @@ public:
/// ExceptionTryExitFn - LLVM objc_exception_try_exit function.
llvm::FunctionCallee getExceptionTryExitFn() {
- llvm::Type *params[] = { ExceptionDataTy->getPointerTo() };
+ llvm::Type *params[] = {CGM.UnqualPtrTy};
return CGM.CreateRuntimeFunction(
llvm::FunctionType::get(CGM.VoidTy, params, false),
"objc_exception_try_exit");
@@ -570,7 +569,7 @@ public:
/// ExceptionExtractFn - LLVM objc_exception_extract function.
llvm::FunctionCallee getExceptionExtractFn() {
- llvm::Type *params[] = { ExceptionDataTy->getPointerTo() };
+ llvm::Type *params[] = {CGM.UnqualPtrTy};
return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy,
params, false),
"objc_exception_extract");
@@ -587,7 +586,7 @@ public:
/// SetJmpFn - LLVM _setjmp function.
llvm::FunctionCallee getSetJmpFn() {
// This is specifically the prototype for x86.
- llvm::Type *params[] = { CGM.Int32Ty->getPointerTo() };
+ llvm::Type *params[] = {CGM.UnqualPtrTy};
return CGM.CreateRuntimeFunction(
llvm::FunctionType::get(CGM.Int32Ty, params, false), "_setjmp",
llvm::AttributeList::get(CGM.getLLVMContext(),
@@ -6051,9 +6050,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModul
Int8PtrTy, PropertyListPtrTy);
// ImpnfABITy - LLVM for id (*)(id, SEL, ...)
- llvm::Type *params[] = { ObjectPtrTy, SelectorPtrTy };
- ImpnfABITy = llvm::FunctionType::get(ObjectPtrTy, params, false)
- ->getPointerTo();
+ ImpnfABITy = CGM.UnqualPtrTy;
// struct _class_t {
// struct _class_t *isa;
@@ -6469,8 +6466,7 @@ void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) {
llvm::GlobalValue::ExternalLinkage, nullptr,
"_objc_empty_vtable");
else
- ObjCEmptyVtableVar =
- llvm::ConstantPointerNull::get(ObjCTypes.ImpnfABITy->getPointerTo());
+ ObjCEmptyVtableVar = llvm::ConstantPointerNull::get(CGM.UnqualPtrTy);
}
// FIXME: Is this correct (that meta class size is never computed)?
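// --- Editorial sketch (illustrative only) --------------------------------------
// The CGObjCMac changes replace Ty->getPointerTo() with CGM.UnqualPtrTy. With
// opaque pointers every pointer-typed argument lowers to the same `ptr` type,
// so the runtime-function prototypes are unchanged. A rough standalone
// rendering of the first helper (names are illustrative, not the CodeGenModule
// members; requires LLVM headers to build):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::PointerType *UnqualPtrTy = llvm::PointerType::getUnqual(Ctx); // 'ptr'
  llvm::Type *ObjectPtrTy = UnqualPtrTy;  // ObjC 'id' also lowers to 'ptr'
  llvm::Type *Args[] = {UnqualPtrTy};     // id objc_read_weak(id *)
  llvm::FunctionType *FTy =
      llvm::FunctionType::get(ObjectPtrTy, Args, /*isVarArg=*/false);
  FTy->print(llvm::outs());  // prints roughly: ptr (ptr)
  return 0;
}
// -------------------------------------------------------------------------------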
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index df80245..93e85f7 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -149,13 +149,9 @@ static std::optional<llvm::Triple>
getHIPOffloadTargetTriple(const Driver &D, const ArgList &Args) {
if (!Args.hasArg(options::OPT_offload_EQ)) {
auto OffloadArchs = Args.getAllArgValues(options::OPT_offload_arch_EQ);
- if (llvm::is_contained(OffloadArchs, "amdgcnspirv")) {
- if (OffloadArchs.size() == 1)
- return llvm::Triple("spirv64-amd-amdhsa");
- // Mixing specific & SPIR-V compilation is not supported for now.
- D.Diag(diag::err_drv_only_one_offload_target_supported);
- return std::nullopt;
- }
+ if (llvm::is_contained(OffloadArchs, "amdgcnspirv") &&
+ OffloadArchs.size() == 1)
+ return llvm::Triple("spirv64-amd-amdhsa");
return llvm::Triple("amdgcn-amd-amdhsa"); // Default HIP triple.
}
auto TT = getOffloadTargetTriple(D, Args);
@@ -458,6 +454,7 @@ DerivedArgList *Driver::TranslateInputArgs(const InputArgList &Args) const {
// some build systems. We don't try to be complete here because we don't
// care to encourage this usage model.
if (A->getOption().matches(options::OPT_Wp_COMMA) &&
+ A->getNumValues() > 0 &&
(A->getValue(0) == StringRef("-MD") ||
A->getValue(0) == StringRef("-MMD"))) {
// Rewrite to -MD/-MMD along with -MF.
@@ -3477,9 +3474,11 @@ class OffloadingActionBuilder final {
llvm::StringMap<bool> Features;
// getHIPOffloadTargetTriple() is known to return valid value as it has
// been called successfully in the CreateOffloadingDeviceToolChains().
- auto ArchStr = parseTargetID(
- *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), IdStr,
- &Features);
+ auto T =
+ (IdStr == "amdgcnspirv")
+ ? llvm::Triple("spirv64-amd-amdhsa")
+ : *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs());
+ auto ArchStr = parseTargetID(T, IdStr, &Features);
if (!ArchStr) {
C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << IdStr;
C.setContainsError();
@@ -5755,7 +5754,7 @@ InputInfoList Driver::BuildJobsForActionNoCache(
// We only have to generate a prefix for the host if this is not a top-level
// action.
std::string OffloadingPrefix = Action::GetOffloadingFileNamePrefix(
- A->getOffloadingDeviceKind(), TC->getTriple().normalize(),
+ A->getOffloadingDeviceKind(), EffectiveTriple.normalize(),
/*CreatePrefixForHost=*/isa<OffloadPackagerJobAction>(A) ||
!(A->getOffloadingHostActiveKinds() == Action::OFK_None ||
AtTopLevel));
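// --- Editorial sketch (assumption, simplified from the hunk above) -------------
// With the diagnostic removed, mixing amdgcnspirv with concrete GPUs now falls
// through to the default amdgcn triple, and the SPIR-V triple is chosen per
// arch later in the offloading action builder. A condensed rendering:
#include <string>
#include <vector>

static std::string hipTripleFor(const std::vector<std::string> &Archs) {
  bool HasSpirv = false;
  for (const auto &A : Archs)
    HasSpirv |= (A == "amdgcnspirv");
  if (HasSpirv && Archs.size() == 1)
    return "spirv64-amd-amdhsa";
  return "amdgcn-amd-amdhsa";  // default HIP triple; SPIR-V handled per arch
}

int main() {
  // Only amdgcnspirv -> SPIR-V triple; mixed with gfx900 -> default amdgcn.
  return hipTripleFor({"amdgcnspirv"}) == "spirv64-amd-amdhsa" &&
                 hipTripleFor({"amdgcnspirv", "gfx900"}) == "amdgcn-amd-amdhsa"
             ? 0
             : 1;
}
// -------------------------------------------------------------------------------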
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index bdf3da0..9774d3f 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1099,6 +1099,12 @@ std::string ToolChain::ComputeLLVMTriple(const ArgList &Args,
}
case llvm::Triple::aarch64_32:
return getTripleString();
+ case llvm::Triple::amdgcn: {
+ llvm::Triple Triple = getTriple();
+ if (Args.getLastArgValue(options::OPT_mcpu_EQ) == "amdgcnspirv")
+ Triple.setArch(llvm::Triple::ArchType::spirv64);
+ return Triple.getTriple();
+ }
case llvm::Triple::arm:
case llvm::Triple::armeb:
case llvm::Triple::thumb:
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index bae05cc..4eb8c4f 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -205,7 +205,7 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
if (JA.getType() == types::TY_LLVM_BC)
return constructLlvmLinkCommand(C, JA, Inputs, Output, Args);
- if (getToolChain().getTriple().isSPIRV())
+ if (getToolChain().getEffectiveTriple().isSPIRV())
return constructLinkAndEmitSpirvCommand(C, JA, Inputs, Output, Args);
return constructLldCommand(C, JA, Inputs, Output, Args);
@@ -264,12 +264,14 @@ void HIPAMDToolChain::addClangTargetOptions(
CC1Args.push_back("-fapply-global-visibility-to-externs");
}
- // For SPIR-V we embed the command-line into the generated binary, in order to
- // retrieve it at JIT time and be able to do target specific compilation with
- // options that match the user-supplied ones.
- if (getTriple().isSPIRV() &&
- !DriverArgs.hasArg(options::OPT_fembed_bitcode_marker))
- CC1Args.push_back("-fembed-bitcode=marker");
+ if (getEffectiveTriple().isSPIRV()) {
+ // For SPIR-V we embed the command-line into the generated binary, in order
+ // to retrieve it at JIT time and be able to do target specific compilation
+ // with options that match the user-supplied ones.
+ if (!DriverArgs.hasArg(options::OPT_fembed_bitcode_marker))
+ CC1Args.push_back("-fembed-bitcode=marker");
+ return; // No DeviceLibs for SPIR-V.
+ }
for (auto BCFile : getDeviceLibs(DriverArgs)) {
CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode"
@@ -361,8 +363,7 @@ llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
llvm::SmallVector<BitCodeLibraryInfo, 12> BCLibs;
if (DriverArgs.hasArg(options::OPT_nogpulib) ||
- (getTriple().getArch() == llvm::Triple::spirv64 &&
- getTriple().getVendor() == llvm::Triple::AMD))
+ getGPUArch(DriverArgs) == "amdgcnspirv")
return {};
ArgStringList LibraryPaths;
@@ -437,8 +438,8 @@ HIPAMDToolChain::getDeviceLibs(const llvm::opt::ArgList &DriverArgs) const {
void HIPAMDToolChain::checkTargetID(
const llvm::opt::ArgList &DriverArgs) const {
auto PTID = getParsedTargetID(DriverArgs);
- if (PTID.OptionalTargetID && !PTID.OptionalGPUArch) {
+ if (PTID.OptionalTargetID && !PTID.OptionalGPUArch &&
+ PTID.OptionalTargetID != "amdgcnspirv")
getDriver().Diag(clang::diag::err_drv_bad_target_id)
<< *PTID.OptionalTargetID;
- }
}
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index 9fe4f1e..c8075cb 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -304,10 +304,14 @@ void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
for (const auto &II : Inputs) {
const auto *A = II.getAction();
auto ArchStr = llvm::StringRef(A->getOffloadingArch());
- BundlerTargetArg +=
- "," + OffloadKind + "-" + normalizeForBundler(TT, !ArchStr.empty());
+ BundlerTargetArg += ',' + OffloadKind + '-';
+ if (ArchStr == "amdgcnspirv")
+ BundlerTargetArg +=
+ normalizeForBundler(llvm::Triple("spirv64-amd-amdhsa"), true);
+ else
+ BundlerTargetArg += normalizeForBundler(TT, !ArchStr.empty());
if (!ArchStr.empty())
- BundlerTargetArg += "-" + ArchStr.str();
+ BundlerTargetArg += '-' + ArchStr.str();
}
BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
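// --- Editorial sketch (illustrative only) --------------------------------------
// With the change above, an amdgcnspirv input contributes a SPIR-V bundle entry
// while concrete GPUs keep the amdgcn triple. For a hypothetical
// "--offload-arch=amdgcnspirv --offload-arch=gfx900" compile, the -targets=
// value handed to clang-offload-bundler is shaped roughly as below (host entry
// and triple normalization simplified):
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::string Targets = "-targets=host-x86_64-unknown-linux-gnu";
  std::vector<std::pair<std::string, std::string>> Entries = {
      {"spirv64-amd-amdhsa-", "amdgcnspirv"},  // amdgcnspirv -> SPIR-V triple
      {"amdgcn-amd-amdhsa-", "gfx900"}};       // concrete GPU -> amdgcn triple
  for (const auto &E : Entries)
    Targets += ",hipv4-" + E.first + "-" + E.second;
  std::cout << Targets << "\n";  // ...,hipv4-spirv64-amd-amdhsa--amdgcnspirv,...
  return 0;
}
// -------------------------------------------------------------------------------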
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 3eef3dc..b757e20 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -323,84 +323,69 @@ void Preprocessor::dumpMacroInfo(const IdentifierInfo *II) {
}
}
-/// RegisterBuiltinMacro - Register the specified identifier in the identifier
-/// table and mark it as a builtin macro to be expanded.
-static IdentifierInfo *RegisterBuiltinMacro(Preprocessor &PP, const char *Name){
- // Get the identifier.
- IdentifierInfo *Id = PP.getIdentifierInfo(Name);
-
- // Mark it as being a macro that is builtin.
- MacroInfo *MI = PP.AllocateMacroInfo(SourceLocation());
- MI->setIsBuiltinMacro();
- PP.appendDefMacroDirective(Id, MI);
- return Id;
-}
-
/// RegisterBuiltinMacros - Register builtin macros, such as __LINE__ with the
/// identifier table.
void Preprocessor::RegisterBuiltinMacros() {
- Ident__LINE__ = RegisterBuiltinMacro(*this, "__LINE__");
- Ident__FILE__ = RegisterBuiltinMacro(*this, "__FILE__");
- Ident__DATE__ = RegisterBuiltinMacro(*this, "__DATE__");
- Ident__TIME__ = RegisterBuiltinMacro(*this, "__TIME__");
- Ident__COUNTER__ = RegisterBuiltinMacro(*this, "__COUNTER__");
- Ident_Pragma = RegisterBuiltinMacro(*this, "_Pragma");
- Ident__FLT_EVAL_METHOD__ = RegisterBuiltinMacro(*this, "__FLT_EVAL_METHOD__");
+ Ident__LINE__ = RegisterBuiltinMacro("__LINE__");
+ Ident__FILE__ = RegisterBuiltinMacro("__FILE__");
+ Ident__DATE__ = RegisterBuiltinMacro("__DATE__");
+ Ident__TIME__ = RegisterBuiltinMacro("__TIME__");
+ Ident__COUNTER__ = RegisterBuiltinMacro("__COUNTER__");
+ Ident_Pragma = RegisterBuiltinMacro("_Pragma");
+ Ident__FLT_EVAL_METHOD__ = RegisterBuiltinMacro("__FLT_EVAL_METHOD__");
// C++ Standing Document Extensions.
if (getLangOpts().CPlusPlus)
- Ident__has_cpp_attribute =
- RegisterBuiltinMacro(*this, "__has_cpp_attribute");
+ Ident__has_cpp_attribute = RegisterBuiltinMacro("__has_cpp_attribute");
else
Ident__has_cpp_attribute = nullptr;
// GCC Extensions.
- Ident__BASE_FILE__ = RegisterBuiltinMacro(*this, "__BASE_FILE__");
- Ident__INCLUDE_LEVEL__ = RegisterBuiltinMacro(*this, "__INCLUDE_LEVEL__");
- Ident__TIMESTAMP__ = RegisterBuiltinMacro(*this, "__TIMESTAMP__");
+ Ident__BASE_FILE__ = RegisterBuiltinMacro("__BASE_FILE__");
+ Ident__INCLUDE_LEVEL__ = RegisterBuiltinMacro("__INCLUDE_LEVEL__");
+ Ident__TIMESTAMP__ = RegisterBuiltinMacro("__TIMESTAMP__");
// Microsoft Extensions.
if (getLangOpts().MicrosoftExt) {
- Ident__identifier = RegisterBuiltinMacro(*this, "__identifier");
- Ident__pragma = RegisterBuiltinMacro(*this, "__pragma");
+ Ident__identifier = RegisterBuiltinMacro("__identifier");
+ Ident__pragma = RegisterBuiltinMacro("__pragma");
} else {
Ident__identifier = nullptr;
Ident__pragma = nullptr;
}
// Clang Extensions.
- Ident__FILE_NAME__ = RegisterBuiltinMacro(*this, "__FILE_NAME__");
- Ident__has_feature = RegisterBuiltinMacro(*this, "__has_feature");
- Ident__has_extension = RegisterBuiltinMacro(*this, "__has_extension");
- Ident__has_builtin = RegisterBuiltinMacro(*this, "__has_builtin");
+ Ident__FILE_NAME__ = RegisterBuiltinMacro("__FILE_NAME__");
+ Ident__has_feature = RegisterBuiltinMacro("__has_feature");
+ Ident__has_extension = RegisterBuiltinMacro("__has_extension");
+ Ident__has_builtin = RegisterBuiltinMacro("__has_builtin");
Ident__has_constexpr_builtin =
- RegisterBuiltinMacro(*this, "__has_constexpr_builtin");
- Ident__has_attribute = RegisterBuiltinMacro(*this, "__has_attribute");
+ RegisterBuiltinMacro("__has_constexpr_builtin");
+ Ident__has_attribute = RegisterBuiltinMacro("__has_attribute");
if (!getLangOpts().CPlusPlus)
- Ident__has_c_attribute = RegisterBuiltinMacro(*this, "__has_c_attribute");
+ Ident__has_c_attribute = RegisterBuiltinMacro("__has_c_attribute");
else
Ident__has_c_attribute = nullptr;
- Ident__has_declspec = RegisterBuiltinMacro(*this, "__has_declspec_attribute");
- Ident__has_embed = RegisterBuiltinMacro(*this, "__has_embed");
- Ident__has_include = RegisterBuiltinMacro(*this, "__has_include");
- Ident__has_include_next = RegisterBuiltinMacro(*this, "__has_include_next");
- Ident__has_warning = RegisterBuiltinMacro(*this, "__has_warning");
- Ident__is_identifier = RegisterBuiltinMacro(*this, "__is_identifier");
- Ident__is_target_arch = RegisterBuiltinMacro(*this, "__is_target_arch");
- Ident__is_target_vendor = RegisterBuiltinMacro(*this, "__is_target_vendor");
- Ident__is_target_os = RegisterBuiltinMacro(*this, "__is_target_os");
+ Ident__has_declspec = RegisterBuiltinMacro("__has_declspec_attribute");
+ Ident__has_embed = RegisterBuiltinMacro("__has_embed");
+ Ident__has_include = RegisterBuiltinMacro("__has_include");
+ Ident__has_include_next = RegisterBuiltinMacro("__has_include_next");
+ Ident__has_warning = RegisterBuiltinMacro("__has_warning");
+ Ident__is_identifier = RegisterBuiltinMacro("__is_identifier");
+ Ident__is_target_arch = RegisterBuiltinMacro("__is_target_arch");
+ Ident__is_target_vendor = RegisterBuiltinMacro("__is_target_vendor");
+ Ident__is_target_os = RegisterBuiltinMacro("__is_target_os");
Ident__is_target_environment =
- RegisterBuiltinMacro(*this, "__is_target_environment");
- Ident__is_target_variant_os =
- RegisterBuiltinMacro(*this, "__is_target_variant_os");
+ RegisterBuiltinMacro("__is_target_environment");
+ Ident__is_target_variant_os = RegisterBuiltinMacro("__is_target_variant_os");
Ident__is_target_variant_environment =
- RegisterBuiltinMacro(*this, "__is_target_variant_environment");
+ RegisterBuiltinMacro("__is_target_variant_environment");
// Modules.
- Ident__building_module = RegisterBuiltinMacro(*this, "__building_module");
+ Ident__building_module = RegisterBuiltinMacro("__building_module");
if (!getLangOpts().CurrentModule.empty())
- Ident__MODULE__ = RegisterBuiltinMacro(*this, "__MODULE__");
+ Ident__MODULE__ = RegisterBuiltinMacro("__MODULE__");
else
Ident__MODULE__ = nullptr;
}
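// --- Editorial note (assumption) -----------------------------------------------
// The static RegisterBuiltinMacro helper removed above is presumably replaced
// by a Preprocessor member declared elsewhere in this patch. Reconstructed from
// the deleted code, it would look roughly like:
//
//   IdentifierInfo *Preprocessor::RegisterBuiltinMacro(const char *Name) {
//     IdentifierInfo *Id = getIdentifierInfo(Name);
//     MacroInfo *MI = AllocateMacroInfo(SourceLocation());
//     MI->setIsBuiltinMacro();
//     appendDefMacroDirective(Id, MI);
//     return Id;
//   }
// -------------------------------------------------------------------------------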
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index ce85644..a836544 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -14,8 +14,6 @@
#include "clang/AST/Attr.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/Type.h"
-#include "clang/Basic/AttrKinds.h"
-#include "clang/Basic/HLSLRuntime.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Sema.h"
@@ -164,16 +162,7 @@ struct BuiltinTypeDeclBuilder {
VD, false, NameInfo, Ty, VK_PRValue);
}
- static Expr *emitResourceClassExpr(ASTContext &AST, ResourceClass RC) {
- return IntegerLiteral::Create(
- AST,
- llvm::APInt(AST.getIntWidth(AST.UnsignedCharTy),
- static_cast<uint8_t>(RC)),
- AST.UnsignedCharTy, SourceLocation());
- }
-
- BuiltinTypeDeclBuilder &addDefaultHandleConstructor(Sema &S,
- ResourceClass RC) {
+ BuiltinTypeDeclBuilder &addDefaultHandleConstructor(Sema &S) {
if (Record->isCompleteDefinition())
return *this;
ASTContext &AST = Record->getASTContext();
@@ -480,7 +469,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S,
bool IsROV, bool RawBuffer) {
return BuiltinTypeDeclBuilder(Decl)
.addHandleMember(S, RC, RK, IsROV, RawBuffer)
- .addDefaultHandleConstructor(S, RC);
+ .addDefaultHandleConstructor(S);
}
void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 1aa3e8e..00c8f87 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -12094,6 +12094,9 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
if (LangOpts.OpenMP)
OpenMP().ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(NewFD);
+ if (LangOpts.isSYCL() && NewFD->hasAttr<SYCLKernelEntryPointAttr>())
+ getASTContext().registerSYCLEntryPointFunction(NewFD);
+
// Semantic checking for this function declaration (in isolation).
if (getLangOpts().CPlusPlus) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 601c6f2..a90f870 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -6620,6 +6620,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_SYCLKernel:
S.SYCL().handleKernelAttr(D, AL);
break;
+ case ParsedAttr::AT_SYCLKernelEntryPoint:
+ S.SYCL().handleKernelEntryPointAttr(D, AL);
+ break;
case ParsedAttr::AT_SYCLSpecialClass:
handleSimpleAttribute<SYCLSpecialClassAttr>(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp
index 3fa326d..ab728f2 100644
--- a/clang/lib/Sema/SemaFunctionEffects.cpp
+++ b/clang/lib/Sema/SemaFunctionEffects.cpp
@@ -971,6 +971,7 @@ private:
PendingFunctionAnalysis &CurrentFunction;
CallableInfo &CurrentCaller;
ViolationSite VSite;
+ const Expr *TrailingRequiresClause = nullptr;
FunctionBodyASTVisitor(Analyzer &Outer,
PendingFunctionAnalysis &CurrentFunction,
@@ -985,6 +986,9 @@ private:
if (auto *Dtor = dyn_cast<CXXDestructorDecl>(CurrentCaller.CDecl))
followDestructor(dyn_cast<CXXRecordDecl>(Dtor->getParent()), Dtor);
+ if (auto *FD = dyn_cast<FunctionDecl>(CurrentCaller.CDecl))
+ TrailingRequiresClause = FD->getTrailingRequiresClause();
+
// Do an AST traversal of the function/block body
TraverseDecl(const_cast<Decl *>(CurrentCaller.CDecl));
}
@@ -1259,6 +1263,17 @@ private:
return true;
}
+ bool TraverseStmt(Stmt *Statement) {
+ // If this statement is a `requires` clause from the top-level function
+ // being traversed, ignore it, since it's not generating runtime code.
+ // We skip the traversal of lambdas (beyond their captures, see
+ // TraverseLambdaExpr below), so just caching this from our constructor
+ // should suffice.
+ if (Statement != TrailingRequiresClause)
+ return Base::TraverseStmt(Statement);
+ return true;
+ }
+
bool TraverseConstructorInitializer(CXXCtorInitializer *Init) {
ViolationSite PrevVS = VSite;
if (Init->isAnyMemberInitializer())
@@ -1297,6 +1312,7 @@ private:
}
bool TraverseBlockExpr(BlockExpr * /*unused*/) {
+ // As with lambdas, don't traverse the block's body.
// TODO: are the capture expressions (ctor call?) safe?
return true;
}
diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index f2d13d4..e7ceceb 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -198,3 +198,12 @@ void SemaSYCL::handleKernelAttr(Decl *D, const ParsedAttr &AL) {
handleSimpleAttribute<SYCLKernelAttr>(*this, D, AL);
}
+
+void SemaSYCL::handleKernelEntryPointAttr(Decl *D, const ParsedAttr &AL) {
+ ParsedType PT = AL.getTypeArg();
+ TypeSourceInfo *TSI = nullptr;
+ (void)SemaRef.GetTypeFromParser(PT, &TSI);
+ assert(TSI && "no type source info for attribute argument");
+ D->addAttr(::new (SemaRef.Context)
+ SYCLKernelEntryPointAttr(SemaRef.Context, AL, TSI));
+}
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index d4e392d..20edd53 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -1155,6 +1155,14 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
for (unsigned I = 0; I != NumParams; ++I)
Params.push_back(readDeclAs<ParmVarDecl>());
FD->setParams(Reader.getContext(), Params);
+
+ // If the declaration is a SYCL kernel entry point function as indicated by
+ // the presence of a sycl_kernel_entry_point attribute, register it so that
+ // associated metadata is recreated.
+ if (FD->hasAttr<SYCLKernelEntryPointAttr>()) {
+ ASTContext &C = Reader.getContext();
+ C.registerSYCLEntryPointFunction(FD);
+ }
}
void ASTDeclReader::VisitObjCMethodDecl(ObjCMethodDecl *MD) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 46819d5..487cde2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -108,7 +108,8 @@ std::optional<bool> isRefCountable(const clang::CXXRecordDecl *R) {
}
std::optional<bool> isCheckedPtrCapable(const clang::CXXRecordDecl *R) {
- return isSmartPtrCompatible(R, "incrementPtrCount", "decrementPtrCount");
+ return isSmartPtrCompatible(R, "incrementCheckedPtrCount",
+ "decrementCheckedPtrCount");
}
bool isRefType(const std::string &Name) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 1a5a730..177c196 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -96,7 +96,8 @@ public:
auto name = safeGetName(MD);
if (name == "ref" || name == "deref")
return;
- if (name == "incrementPtrCount" || name == "decrementPtrCount")
+ if (name == "incrementCheckedPtrCount" ||
+ name == "decrementCheckedPtrCount")
return;
}
auto *E = MemberCallExpr->getImplicitObjectArgument();
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast.cpp b/clang/test/AST/ByteCode/builtin-bit-cast.cpp
index 0c55155..7d1fcbd 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast.cpp
@@ -38,6 +38,13 @@ constexpr Init round_trip(const Init &init) {
return bit_cast<Init>(bit_cast<Intermediate>(init));
}
+
+namespace Discarding {
+ struct S { int a; };
+ constexpr int f = (__builtin_bit_cast(int, 2), 0);
+ constexpr int f2 = (__builtin_bit_cast(S, 2), 0);
+}
+
namespace std {
enum byte : unsigned char {};
} // namespace std
@@ -468,8 +475,52 @@ struct ref_mem {
// both-note@+1 {{bit_cast from a type with a reference member is not allowed in a constant expression}}
constexpr intptr_t run_ref_mem = __builtin_bit_cast(intptr_t, ref_mem{global_int});
+namespace test_vector {
+
+typedef unsigned uint2 __attribute__((vector_size(2 * sizeof(unsigned))));
+typedef char byte8 __attribute__((vector_size(sizeof(unsigned long long))));
+
+constexpr uint2 test_vector = { 0x0C05FEFE, 0xCAFEBABE };
+
+static_assert(bit_cast<unsigned long long>(test_vector) == (LITTLE_END
+ ? 0xCAFEBABE0C05FEFE
+ : 0x0C05FEFECAFEBABE), "");
+static_assert(check_round_trip<uint2>(0xCAFEBABE0C05FEFEULL), "");
+static_assert(check_round_trip<byte8>(0xCAFEBABE0C05FEFEULL), "");
+
+typedef bool bool8 __attribute__((ext_vector_type(8)));
+typedef bool bool9 __attribute__((ext_vector_type(9)));
+typedef bool bool16 __attribute__((ext_vector_type(16)));
+typedef bool bool17 __attribute__((ext_vector_type(17)));
+typedef bool bool32 __attribute__((ext_vector_type(32)));
+typedef bool bool128 __attribute__((ext_vector_type(128)));
+static_assert(bit_cast<unsigned char>(bool8{1,0,1,0,1,0,1,0}) == (LITTLE_END ? 0x55 : 0xAA), "");
+constexpr bool8 b8 = __builtin_bit_cast(bool8, 0x55); // both-error {{__builtin_bit_cast source size does not equal destination size (4 vs 1)}}
+#if 0
+static_assert(check_round_trip<bool8>(static_cast<unsigned char>(0)), "");
+static_assert(check_round_trip<bool8>(static_cast<unsigned char>(1)), "");
+static_assert(check_round_trip<bool8>(static_cast<unsigned char>(0x55)), "");
+
+static_assert(bit_cast<unsigned short>(bool16{1,1,1,1,1,0,0,0, 1,1,1,1,0,1,0,0}) == (LITTLE_END ? 0x2F1F : 0xF8F4), "");
+
+static_assert(check_round_trip<bool16>(static_cast<short>(0xCAFE)), "");
+static_assert(check_round_trip<bool32>(static_cast<int>(0xCAFEBABE)), "");
+static_assert(check_round_trip<bool128>(static_cast<__int128_t>(0xCAFEBABE0C05FEFEULL)), "");
+#endif
+#if 0
+// expected-error@+2 {{constexpr variable 'bad_bool9_to_short' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type 'bool __attribute__((ext_vector_type(9)))' (vector of 9 'bool' values) is not allowed in a constant expression; element size 1 * element count 9 is not a multiple of the byte size 8}}
+constexpr unsigned short bad_bool9_to_short = __builtin_bit_cast(unsigned short, bool9{1,1,0,1,0,1,0,1,0});
+// expected-error@+2 {{constexpr variable 'bad_short_to_bool9' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type 'bool __attribute__((ext_vector_type(9)))' (vector of 9 'bool' values) is not allowed in a constant expression; element size 1 * element count 9 is not a multiple of the byte size 8}}
+constexpr bool9 bad_short_to_bool9 = __builtin_bit_cast(bool9, static_cast<unsigned short>(0));
+// expected-error@+2 {{constexpr variable 'bad_int_to_bool17' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type 'bool __attribute__((ext_vector_type(17)))' (vector of 17 'bool' values) is not allowed in a constant expression; element size 1 * element count 17 is not a multiple of the byte size 8}}
+constexpr bool17 bad_int_to_bool17 = __builtin_bit_cast(bool17, 0x0001CAFEU);
+#endif
+}
namespace test_complex {
constexpr _Complex unsigned test_int_complex = { 0x0C05FEFE, 0xCAFEBABE };
diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl
index e6ce73d..ebddd72 100644
--- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl
@@ -15,7 +15,7 @@
// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class RWBuffer
// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// There should be no more occurrances of RWBuffer
+// There should be no more occurrences of RWBuffer
// EMPTY-NOT: RWBuffer
#ifndef EMPTY
diff --git a/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl
index f95d74b..4104250 100644
--- a/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl
@@ -16,7 +16,7 @@
// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class RWStructuredBuffer
// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// There should be no more occurrances of RWStructuredBuffer
+// There should be no more occurrences of RWStructuredBuffer
// EMPTY-NOT: {{[^[:alnum:]]}}RWStructuredBuffer
#ifndef EMPTY
diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
index 6c39be8..42a7d1b 100644
--- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
+++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
@@ -16,7 +16,7 @@
// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class StructuredBuffer
// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
-// There should be no more occurrances of StructuredBuffer
+// There should be no more occurrences of StructuredBuffer
// EMPTY-NOT: {{[^[:alnum:]]}}StructuredBuffer
#ifndef EMPTY
diff --git a/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp b/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp
new file mode 100644
index 0000000..c351f3b
--- /dev/null
+++ b/clang/test/ASTSYCL/ast-dump-sycl-kernel-entry-point.cpp
@@ -0,0 +1,144 @@
+// Tests without serialization:
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-device \
+// RUN: -ast-dump %s \
+// RUN: | FileCheck --match-full-lines %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-host \
+// RUN: -ast-dump %s \
+// RUN: | FileCheck --match-full-lines %s
+//
+// Tests with serialization:
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-device \
+// RUN: -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-device \
+// RUN: -include-pch %t -ast-dump-all /dev/null \
+// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
+// RUN: | FileCheck --match-full-lines %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-host \
+// RUN: -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -std=c++17 -triple x86_64-unknown-unknown -fsycl-is-host \
+// RUN: -include-pch %t -ast-dump-all /dev/null \
+// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
+// RUN: | FileCheck --match-full-lines %s
+
+// These tests validate the AST produced for functions declared with the
+// sycl_kernel_entry_point attribute.
+
+// CHECK: TranslationUnitDecl {{.*}}
+
+// A unique kernel name type is required for each declared kernel entry point.
+template<int, int=0> struct KN;
+
+__attribute__((sycl_kernel_entry_point(KN<1>)))
+void skep1() {
+}
+// CHECK: |-FunctionDecl {{.*}} skep1 'void ()'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN<1>
+
+using KN2 = KN<2>;
+__attribute__((sycl_kernel_entry_point(KN2)))
+void skep2() {
+}
+// CHECK: |-FunctionDecl {{.*}} skep2 'void ()'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN2
+
+template<int I> using KNT = KN<I>;
+__attribute__((sycl_kernel_entry_point(KNT<3>)))
+void skep3() {
+}
+// CHECK: |-FunctionDecl {{.*}} skep3 'void ()'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KNT<3>
+
+template<typename KNT, typename F>
+[[clang::sycl_kernel_entry_point(KNT)]]
+void skep4(F f) {
+ f();
+}
+// CHECK: |-FunctionTemplateDecl {{.*}} skep4
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} KNT
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} F
+// CHECK-NEXT: | |-FunctionDecl {{.*}} skep4 'void (F)'
+// CHECK: | | `-SYCLKernelEntryPointAttr {{.*}} KNT
+
+void test_skep4() {
+ skep4<KNT<4>>([]{});
+}
+// CHECK: | `-FunctionDecl {{.*}} used skep4 'void ((lambda at {{.*}}))' implicit_instantiation
+// CHECK-NEXT: | |-TemplateArgument type 'KN<4>'
+// CHECK: | |-TemplateArgument type '(lambda at {{.*}})'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} struct KN<4>
+// CHECK-NEXT: |-FunctionDecl {{.*}} test_skep4 'void ()'
+
+template<typename KNT, typename T>
+[[clang::sycl_kernel_entry_point(KNT)]]
+void skep5(T) {
+}
+// CHECK: |-FunctionTemplateDecl {{.*}} skep5
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} KNT
+// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} T
+// CHECK-NEXT: | |-FunctionDecl {{.*}} skep5 'void (T)'
+// CHECK: | | `-SYCLKernelEntryPointAttr {{.*}} KNT
+
+// Checks for the explicit template instantiation declaration below.
+// CHECK: | `-FunctionDecl {{.*}} skep5 'void (int)' explicit_instantiation_definition
+// CHECK-NEXT: | |-TemplateArgument type 'KN<5, 4>'
+// CHECK: | |-TemplateArgument type 'int'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN<5, 4>
+
+// FIXME: C++23 [temp.expl.spec]p12 states:
+// FIXME: ... Similarly, attributes appearing in the declaration of a template
+// FIXME: have no effect on an explicit specialization of that template.
+// FIXME: Clang currently instantiates and propagates attributes from a function
+// FIXME: template to its explicit specializations resulting in the following
+// FIXME: explicit specialization having an attribute incorrectly attached.
+template<>
+void skep5<KN<5,1>>(short) {
+}
+// CHECK: |-FunctionDecl {{.*}} prev {{.*}} skep5 'void (short)' explicit_specialization
+// CHECK-NEXT: | |-TemplateArgument type 'KN<5, 1>'
+// CHECK: | |-TemplateArgument type 'short'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} Inherited struct KN<5, 1>
+
+template<>
+[[clang::sycl_kernel_entry_point(KN<5,2>)]]
+void skep5<KN<5,2>>(long) {
+}
+// CHECK: |-FunctionDecl {{.*}} prev {{.*}} skep5 'void (long)' explicit_specialization
+// CHECK-NEXT: | |-TemplateArgument type 'KN<5, 2>'
+// CHECK: | |-TemplateArgument type 'long'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN<5, 2>
+
+template<>
+[[clang::sycl_kernel_entry_point(KN<5,3>)]]
+void skep5<KN<5,-1>>(long long) {
+}
+// CHECK: |-FunctionDecl {{.*}} prev {{.*}} skep5 'void (long long)' explicit_specialization
+// CHECK-NEXT: | |-TemplateArgument type 'KN<5, -1>'
+// CHECK: | |-TemplateArgument type 'long long'
+// CHECK: | `-SYCLKernelEntryPointAttr {{.*}} KN<5, 3>
+
+template void skep5<KN<5,4>>(int);
+// Checks are located with the primary template declaration above.
+
+// Ensure that matching attributes from multiple declarations are ok.
+[[clang::sycl_kernel_entry_point(KN<6>)]]
+void skep6();
+[[clang::sycl_kernel_entry_point(KN<6>)]]
+void skep6() {
+}
+// CHECK: |-FunctionDecl {{.*}} skep6 'void ()'
+// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<6>
+// CHECK-NEXT: |-FunctionDecl {{.*}} prev {{.*}} skep6 'void ()'
+// CHECK-NEXT: | |-CompoundStmt {{.*}}
+// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<6>
+
+// Ensure that matching attributes from the same declaration are ok.
+[[clang::sycl_kernel_entry_point(KN<7>), clang::sycl_kernel_entry_point(KN<7>)]]
+void skep7() {
+}
+// CHECK: |-FunctionDecl {{.*}} skep7 'void ()'
+// CHECK-NEXT: | |-CompoundStmt {{.*}}
+// CHECK-NEXT: | |-SYCLKernelEntryPointAttr {{.*}} KN<7>
+// CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<7>
+
+void the_end() {}
+// CHECK: `-FunctionDecl {{.*}} the_end 'void ()'
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index 8d95926..9c9326f 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -146,9 +146,9 @@ private:
public:
CheckedRef() : t{} {};
- CheckedRef(T &t) : t(&t) { t.incrementPtrCount(); }
- CheckedRef(const CheckedRef &o) : t(o.t) { if (t) t->incrementPtrCount(); }
- ~CheckedRef() { if (t) t->decrementPtrCount(); }
+ CheckedRef(T &t) : t(&t) { t.incrementCheckedPtrCount(); }
+ CheckedRef(const CheckedRef &o) : t(o.t) { if (t) t->incrementCheckedPtrCount(); }
+ ~CheckedRef() { if (t) t->decrementCheckedPtrCount(); }
T &get() { return *t; }
T *ptr() { return t; }
T *operator->() { return t; }
@@ -165,14 +165,14 @@ public:
CheckedPtr(T *t)
: t(t) {
if (t)
- t->incrementPtrCount();
+ t->incrementCheckedPtrCount();
}
CheckedPtr(Ref<T> &&o)
: t(o.leakRef())
{ }
~CheckedPtr() {
if (t)
- t->decrementPtrCount();
+ t->decrementCheckedPtrCount();
}
T *get() { return t; }
T *operator->() { return t; }
@@ -184,16 +184,16 @@ public:
class CheckedObj {
public:
- void incrementPtrCount();
- void decrementPtrCount();
+ void incrementCheckedPtrCount();
+ void decrementCheckedPtrCount();
void method();
int trivial() { return 123; }
};
class RefCountableAndCheckable {
public:
- void incrementPtrCount() const;
- void decrementPtrCount() const;
+ void incrementCheckedPtrCount() const;
+ void decrementCheckedPtrCount() const;
void ref() const;
void deref() const;
void method();
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 26a1bf2..2dc6ead 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -271,4 +271,4 @@
// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AMDGPUSPIRV64
-// AMDGPUSPIRV64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0"
+// AMDGPUSPIRV64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n32:64-S32-G1-P4-A0"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bf5f297..9132cc8 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -638,7 +638,7 @@ void test_get_workgroup_size(int d, global int *out)
// CHECK-LABEL: @test_get_grid_size(
// CHECK: {{.*}}call align 4 dereferenceable(64){{.*}} ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 %.sink
+// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 %{{.+}}
// CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !invariant.load
void test_get_grid_size(int d, global int *out)
{
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 0cdc82e..4a91c9d 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -36,6 +36,11 @@
// RUN: %t/a.o %t/b.o \
// RUN: 2>&1 | FileCheck -check-prefixes=LKONLY %s
+// RUN: %clang -### --target=x86_64-linux-gnu \
+// RUN: --offload-arch=amdgcnspirv --offload-arch=gfx900 \
+// RUN: %s -nogpuinc -nogpulib \
+// RUN: 2>&1 | FileCheck -check-prefixes=AMDGCNSPIRV %s
+
//
// Compile device code in a.cu to code object for gfx803.
//
@@ -177,3 +182,16 @@
// LKONLY-NOT: {{".*/llc"}}
// LKONLY: [[LD:".*ld.*"]] {{.*}} "{{.*/a.o}}" "{{.*/b.o}}"
// LKONLY-NOT: "-T" "{{.*}}.lk"
+
+//
+// Check mixed AMDGCNSPIRV and concrete GPU arch.
+//
+
+// AMDGCNSPIRV: "-cc1" "-triple" "spirv64-amd-amdhsa" {{.*}}"-emit-obj" {{.*}} "-o" "[[AMDGCNSPV_OBJ:.*o]]"
+// AMDGCNSPIRV: {{".*llvm-link.*"}} "-o" "[[AMDGCNSPV_TMP:.*out]]" "[[AMDGCNSPV_OBJ]]"
+// AMDGCNSPIRV: {{".*llvm-spirv.*"}} "--spirv-max-version=1.6" "--spirv-ext=+all" {{.*}} "[[AMDGCNSPV_TMP]]" {{.*}}"-o" "[[AMDGCNSPV_CO:.*out]]"
+// AMDGCNSPIRV: "-cc1" "-triple" "amdgcn-amd-amdhsa" {{.*}}"-emit-obj" {{.*}}"-target-cpu" "gfx900"{{.*}} "-o" "[[GFX900_OBJ:.*o]]"
+// AMDGCNSPIRV: {{".*lld.*"}} {{.*}}"-plugin-opt=mcpu=gfx900" {{.*}} "-o" "[[GFX900_CO:.*out]]" {{.*}}"[[GFX900_OBJ]]"
+// AMDGCNSPIRV: {{".*clang-offload-bundler.*"}} "-type=o"
+// AMDGCNSPIRV-SAME: "-targets={{.*}}hipv4-spirv64-amd-amdhsa--amdgcnspirv,hipv4-amdgcn-amd-amdhsa--gfx900"
+// AMDGCNSPIRV-SAME: "-input=[[AMDGCNSPV_CO]]" "-input=[[GFX900_CO]]"
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index e28b077..3f1977d 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -178,6 +178,7 @@
// CHECK-NEXT: ReturnTypestate (SubjectMatchRule_function, SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: ReturnsNonNull (SubjectMatchRule_objc_method, SubjectMatchRule_function)
// CHECK-NEXT: ReturnsTwice (SubjectMatchRule_function)
+// CHECK-NEXT: SYCLKernelEntryPoint (SubjectMatchRule_function)
// CHECK-NEXT: SYCLSpecialClass (SubjectMatchRule_record)
// CHECK-NEXT: ScopedLockable (SubjectMatchRule_record)
// CHECK-NEXT: Section (SubjectMatchRule_function, SubjectMatchRule_variable_is_global, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property)
diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp
index f23093d..19a4c3b7 100644
--- a/clang/test/Sema/attr-nonblocking-constraints.cpp
+++ b/clang/test/Sema/attr-nonblocking-constraints.cpp
@@ -388,6 +388,51 @@ void nb26() [[clang::nonblocking]] {
abort_wrapper(); // no diagnostic
}
+// --- Make sure we don't traverse a requires clause. ---
+
+// Apparently some requires clauses can be collapsed into a constant before the nonblocking
+// analysis sees any function calls. This example (extracted from a real-world case in which the
+// declaration of `operator&&` in <valarray> preceded the inclusion of <expected>) is sufficiently
+// complex to look like it contains function calls. There may be simpler examples.
+
+namespace ExpectedTest {
+
+template <class _Tp>
+inline constexpr bool is_copy_constructible_v = __is_constructible(_Tp, _Tp&);
+
+template <bool, class _Tp = void>
+struct enable_if {};
+template <class _Tp>
+struct enable_if<true, _Tp> {
+ typedef _Tp type;
+};
+
+template <bool _Bp, class _Tp = void>
+using enable_if_t = typename enable_if<_Bp, _Tp>::type;
+
+// Doesn't seem to matter whether the enable_if is true or false.
+template <class E1, class E2, enable_if_t<is_copy_constructible_v<E1>> = 0>
+inline bool operator&&(const E1& x, const E2& y);
+
+template <class _Tp, class _Err>
+class expected {
+public:
+ constexpr expected()
+ {}
+
+ constexpr expected(const expected&)
+ requires(is_copy_constructible_v<_Tp> && is_copy_constructible_v<_Err>)
+ = default;
+};
+
+void test() [[clang::nonblocking]]
+{
+ expected<int, int> a;
+ auto b = a;
+}
+
+} // namespace ExpectedTest
+
// --- nonblocking implies noexcept ---
#pragma clang diagnostic warning "-Wperf-constraint-implies-noexcept"
diff --git a/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-grammar.cpp b/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-grammar.cpp
new file mode 100644
index 0000000..c63d241
--- /dev/null
+++ b/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-grammar.cpp
@@ -0,0 +1,137 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++17 -fsyntax-only -fsycl-is-device -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++20 -fsyntax-only -fsycl-is-device -verify %s
+
+// These tests validate parsing of the sycl_kernel_entry_point argument list
+// and that the single argument names a type.
+
+// Templates used to exercise class template specializations.
+template<int> struct ST; // #ST-decl
+template<int N> using TTA = ST<N>; // #TTA-decl
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Valid declarations.
+////////////////////////////////////////////////////////////////////////////////
+
+struct S1;
+[[clang::sycl_kernel_entry_point(S1)]] void ok1();
+
+typedef struct {} TA2;
+[[clang::sycl_kernel_entry_point(TA2)]] void ok2();
+
+using TA3 = struct {};
+[[clang::sycl_kernel_entry_point(TA3)]] void ok3();
+
+[[clang::sycl_kernel_entry_point(ST<4>)]] void ok4();
+
+[[clang::sycl_kernel_entry_point(TTA<5>)]] void ok5();
+
+namespace NS6 {
+ struct NSS;
+}
+[[clang::sycl_kernel_entry_point(NS6::NSS)]] void ok6();
+
+namespace {
+ struct UNSS7;
+}
+[[clang::sycl_kernel_entry_point(UNSS7)]] void ok7();
+
+struct {} s;
+[[clang::sycl_kernel_entry_point(decltype(s))]] void ok8();
+
+template<typename KN>
+[[clang::sycl_kernel_entry_point(KN)]] void ok9();
+void test_ok9() {
+ ok9<struct LS1>();
+}
+
+template<int, typename KN>
+[[clang::sycl_kernel_entry_point(KN)]] void ok10();
+void test_ok10() {
+ ok10<1, struct LS2>();
+}
+
+namespace NS11 {
+ struct NSS;
+}
+template<typename T>
+[[clang::sycl_kernel_entry_point(T)]] void ok11() {}
+template<>
+[[clang::sycl_kernel_entry_point(NS11::NSS)]] void ok11<NS11::NSS>() {}
+
+struct S12;
+[[clang::sycl_kernel_entry_point(S12)]] void ok12();
+[[clang::sycl_kernel_entry_point(S12)]] void ok12() {}
+
+template<typename T>
+[[clang::sycl_kernel_entry_point(T)]] void ok13(T k);
+void test_ok13() {
+ ok13([]{});
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Invalid declarations.
+////////////////////////////////////////////////////////////////////////////////
+
+// expected-error@+1 {{'sycl_kernel_entry_point' attribute takes one argument}}
+[[clang::sycl_kernel_entry_point]] void bad1();
+
+// expected-error@+1 {{'sycl_kernel_entry_point' attribute takes one argument}}
+[[clang::sycl_kernel_entry_point()]] void bad2();
+
+struct B3;
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected ']'}}
+[[clang::sycl_kernel_entry_point(B3,)]] void bad3();
+
+struct B4;
+// expected-error@+3 {{expected ')'}}
+// expected-error@+2 {{expected ','}}
+// expected-warning@+1 {{unknown attribute 'X' ignored}}
+[[clang::sycl_kernel_entry_point(B4, X)]] void bad4();
+
+// expected-error@+1 {{expected a type}}
+[[clang::sycl_kernel_entry_point(1)]] void bad5();
+
+void f6();
+// expected-error@+1 {{unknown type name 'f6'}}
+[[clang::sycl_kernel_entry_point(f6)]] void bad6();
+
+// expected-error@+2 {{use of class template 'ST' requires template arguments; argument deduction not allowed here}}
+// expected-note@#ST-decl {{template is declared here}}
+[[clang::sycl_kernel_entry_point(ST)]] void bad7();
+
+// expected-error@+2 {{use of alias template 'TTA' requires template arguments; argument deduction not allowed here}}
+// expected-note@#TTA-decl {{template is declared here}}
+[[clang::sycl_kernel_entry_point(TTA)]] void bad8();
+
+enum {
+ e9
+};
+// expected-error@+1 {{unknown type name 'e9'}}
+[[clang::sycl_kernel_entry_point(e9)]] void bad9();
+
+#if __cplusplus >= 202002L
+template<typename> concept C = true;
+// expected-error@+1 {{expected a type}}
+[[clang::sycl_kernel_entry_point(C)]] void bad10();
+
+// expected-error@+1 {{expected a type}}
+[[clang::sycl_kernel_entry_point(C<int>)]] void bad11();
+#endif
+
+struct B12; // #B12-decl
+// FIXME: C++23 [temp.expl.spec]p12 states:
+// FIXME: ... Similarly, attributes appearing in the declaration of a template
+// FIXME: have no effect on an explicit specialization of that template.
+// FIXME: Clang currently instantiates and propagates attributes from a function
+// FIXME: template to its explicit specializations resulting in the following
+// FIXME: spurious error.
+// expected-error@+4 {{incomplete type 'B12' named in nested name specifier}}
+// expected-note@+5 {{in instantiation of function template specialization 'bad12<B12>' requested here}}
+// expected-note@#B12-decl {{forward declaration of 'B12'}}
+template<typename T>
+[[clang::sycl_kernel_entry_point(typename T::not_found)]] void bad12() {}
+template<>
+void bad12<B12>() {}
diff --git a/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-ignored.cpp b/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-ignored.cpp
new file mode 100644
index 0000000..30de6ae
--- /dev/null
+++ b/clang/test/SemaSYCL/sycl-kernel-entry-point-attr-ignored.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++17 -fsyntax-only -verify %s
+
+// These tests validate that the sycl_kernel_entry_point attribute is ignored
+// when SYCL support is not enabled.
+
+// A unique kernel name type is required for each declared kernel entry point.
+template<int> struct KN;
+
+// expected-warning@+1 {{'sycl_kernel_entry_point' attribute ignored}}
+[[clang::sycl_kernel_entry_point(KN<1>)]]
+void ok1();
+
+// expected-warning@+2 {{'sycl_kernel_entry_point' attribute ignored}}
+template<typename KNT>
+[[clang::sycl_kernel_entry_point(KNT)]]
+void ok2() {}
+template void ok2<KN<2>>();
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 3031d81..5a80c8c 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -2727,7 +2727,8 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS,
}
if (Header)
- OS << "class " << R.getName() << "Attr : public " << SuperName << " {\n";
+ OS << "class CLANG_ABI " << R.getName() << "Attr : public " << SuperName
+ << " {\n";
else
OS << "\n// " << R.getName() << "Attr implementation\n\n";
@@ -3185,7 +3186,8 @@ void clang::EmitClangAttrClass(const RecordKeeper &Records, raw_ostream &OS) {
emitSourceFileHeader("Attribute classes' definitions", OS, Records);
OS << "#ifndef LLVM_CLANG_ATTR_CLASSES_INC\n";
- OS << "#define LLVM_CLANG_ATTR_CLASSES_INC\n\n";
+ OS << "#define LLVM_CLANG_ATTR_CLASSES_INC\n";
+ OS << "#include \"clang/Support/Compiler.h\"\n\n";
emitAttributes(Records, OS, true);
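// --- Editorial sketch (assumption) ---------------------------------------------
// With the two changes above, the generated AttrClasses.inc would begin roughly
// like this (attribute name chosen purely for illustration):
//
//   #ifndef LLVM_CLANG_ATTR_CLASSES_INC
//   #define LLVM_CLANG_ATTR_CLASSES_INC
//   #include "clang/Support/Compiler.h"
//
//   class CLANG_ABI AlignedAttr : public InheritableAttr {
//     ...
//   };
// -------------------------------------------------------------------------------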
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 431f544..6d52eec 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -37,7 +37,11 @@ check_c_compiler_flag(-nodefaultlibs C_SUPPORTS_NODEFAULTLIBS_FLAG)
if (C_SUPPORTS_NODEFAULTLIBS_FLAG)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nodefaultlibs")
if (COMPILER_RT_HAS_LIBC)
- list(APPEND CMAKE_REQUIRED_LIBRARIES c)
+ if (HAIKU)
+ list(APPEND CMAKE_REQUIRED_LIBRARIES root)
+ else()
+ list(APPEND CMAKE_REQUIRED_LIBRARIES c)
+ endif()
endif ()
if (COMPILER_RT_USE_BUILTINS_LIBRARY)
# TODO: remote this check once we address PR51389.
@@ -826,7 +830,7 @@ else()
endif()
if (PROFILE_SUPPORTED_ARCH AND NOT LLVM_USE_SANITIZER AND
- OS_NAME MATCHES "Darwin|Linux|FreeBSD|Windows|Android|Fuchsia|SunOS|NetBSD|AIX|WASI")
+ OS_NAME MATCHES "Darwin|Linux|FreeBSD|Windows|Android|Fuchsia|SunOS|NetBSD|AIX|WASI|Haiku")
set(COMPILER_RT_HAS_PROFILE TRUE)
else()
set(COMPILER_RT_HAS_PROFILE FALSE)
diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 077a536..cfd9261 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -187,8 +187,12 @@ static uptr GetMmapGranularity() {
return si.dwAllocationGranularity;
}
+UNUSED static uptr RoundDownTo(uptr size, uptr boundary) {
+ return size & ~(boundary - 1);
+}
+
UNUSED static uptr RoundUpTo(uptr size, uptr boundary) {
- return (size + boundary - 1) & ~(boundary - 1);
+ return RoundDownTo(size + boundary - 1, boundary);
}
// FIXME: internal_str* and internal_mem* functions should be moved from the
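// --- Editorial sketch (illustrative only) --------------------------------------
// RoundUpTo is now expressed in terms of the new RoundDownTo; both assume a
// power-of-two boundary. A quick worked check of the arithmetic:
#include <cassert>
#include <cstdint>

static uint64_t RoundDownTo(uint64_t size, uint64_t boundary) {
  return size & ~(boundary - 1);
}
static uint64_t RoundUpTo(uint64_t size, uint64_t boundary) {
  return RoundDownTo(size + boundary - 1, boundary);
}

int main() {
  assert(RoundDownTo(0x12345, 0x1000) == 0x12000);
  assert(RoundUpTo(0x12345, 0x1000) == 0x13000);
  assert(RoundUpTo(0x12000, 0x1000) == 0x12000);  // already aligned
  return 0;
}
// -------------------------------------------------------------------------------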
@@ -285,8 +289,11 @@ static void WriteJumpInstruction(uptr from, uptr target) {
static void WriteShortJumpInstruction(uptr from, uptr target) {
sptr offset = target - from - kShortJumpInstructionLength;
- if (offset < -128 || offset > 127)
+ if (offset < -128 || offset > 127) {
+ ReportError("interception_win: cannot write short jmp from %p to %p\n",
+ (void *)from, (void *)target);
InterceptionFailed();
+ }
*(u8*)from = 0xEB;
*(u8*)(from + 1) = (u8)offset;
}
@@ -340,32 +347,78 @@ struct TrampolineMemoryRegion {
uptr max_size;
};
-UNUSED static const uptr kTrampolineScanLimitRange = 1ull << 31; // 2 gig
+UNUSED static const uptr kTrampolineRangeLimit = 1ull << 31; // 2 gig
static const int kMaxTrampolineRegion = 1024;
static TrampolineMemoryRegion TrampolineRegions[kMaxTrampolineRegion];
-static void *AllocateTrampolineRegion(uptr image_address, size_t granularity) {
-#if SANITIZER_WINDOWS64
- uptr address = image_address;
- uptr scanned = 0;
- while (scanned < kTrampolineScanLimitRange) {
+static void *AllocateTrampolineRegion(uptr min_addr, uptr max_addr,
+ uptr func_addr, size_t granularity) {
+# if SANITIZER_WINDOWS64
+ // Clamp {min,max}_addr to the accessible address space.
+ SYSTEM_INFO system_info;
+ ::GetSystemInfo(&system_info);
+ uptr min_virtual_addr =
+ RoundUpTo((uptr)system_info.lpMinimumApplicationAddress, granularity);
+ uptr max_virtual_addr =
+ RoundDownTo((uptr)system_info.lpMaximumApplicationAddress, granularity);
+ if (min_addr < min_virtual_addr)
+ min_addr = min_virtual_addr;
+ if (max_addr > max_virtual_addr)
+ max_addr = max_virtual_addr;
+
+ // This loop probes the virtual address space to find free memory in the
+ // [min_addr, max_addr] interval. The search starts from func_addr and
+ // proceeds "outwards" towards the interval bounds using two probes, lo_addr
+ // and hi_addr, for addresses lower/higher than func_addr. At each step, it
+ // considers the probe closest to func_addr. If that address is not free, the
+ // probe is advanced (lower or higher depending on the probe) to the next
+ // memory block and the search continues.
+ uptr lo_addr = RoundDownTo(func_addr, granularity);
+ uptr hi_addr = RoundUpTo(func_addr, granularity);
+ while (lo_addr >= min_addr || hi_addr <= max_addr) {
+ // Consider the in-range address closest to func_addr.
+ uptr addr;
+ if (lo_addr < min_addr)
+ addr = hi_addr;
+ else if (hi_addr > max_addr)
+ addr = lo_addr;
+ else
+ addr = (hi_addr - func_addr < func_addr - lo_addr) ? hi_addr : lo_addr;
+
MEMORY_BASIC_INFORMATION info;
- if (!::VirtualQuery((void*)address, &info, sizeof(info)))
+ if (!::VirtualQuery((void *)addr, &info, sizeof(info))) {
+ ReportError(
+ "interception_win: VirtualQuery in AllocateTrampolineRegion failed "
+ "for %p\n",
+ (void *)addr);
return nullptr;
+ }
- // Check whether a region can be allocated at |address|.
+ // Check whether a region can be allocated at |addr|.
if (info.State == MEM_FREE && info.RegionSize >= granularity) {
- void *page = ::VirtualAlloc((void*)RoundUpTo(address, granularity),
- granularity,
- MEM_RESERVE | MEM_COMMIT,
- PAGE_EXECUTE_READWRITE);
+ void *page =
+ ::VirtualAlloc((void *)addr, granularity, MEM_RESERVE | MEM_COMMIT,
+ PAGE_EXECUTE_READWRITE);
+ if (page == nullptr)
+ ReportError(
+ "interception_win: VirtualAlloc in AllocateTrampolineRegion failed "
+ "for %p\n",
+ (void *)addr);
return page;
}
- // Move to the next region.
- address = (uptr)info.BaseAddress + info.RegionSize;
- scanned += info.RegionSize;
+ if (addr == lo_addr)
+ lo_addr =
+ RoundDownTo((uptr)info.AllocationBase - granularity, granularity);
+ if (addr == hi_addr)
+ hi_addr =
+ RoundUpTo((uptr)info.BaseAddress + info.RegionSize, granularity);
}
+
+ ReportError(
+ "interception_win: AllocateTrampolineRegion failed to find free memory; "
+ "min_addr: %p, max_addr: %p, func_addr: %p, granularity: %zu\n",
+ (void *)min_addr, (void *)max_addr, (void *)func_addr, granularity);
return nullptr;
#else
return ::VirtualAlloc(nullptr,
@@ -387,17 +440,17 @@ void TestOnlyReleaseTrampolineRegions() {
}
static uptr AllocateMemoryForTrampoline(uptr func_address, size_t size) {
- uptr image_address = func_address;
+# if SANITIZER_WINDOWS64
+ uptr min_addr = func_address - kTrampolineRangeLimit;
+ uptr max_addr = func_address + kTrampolineRangeLimit - size;
-#if SANITIZER_WINDOWS64
- // Allocate memory after the module (DLL or EXE file), but within 2GB
- // of the start of the module so that any address within the module can be
- // referenced with PC-relative operands.
+ // Allocate memory within 2GB of the module (DLL or EXE file) so that any
+ // address within the module can be referenced with PC-relative operands.
// This allows us to not just jump to the trampoline with a PC-relative
// offset, but to relocate any instructions that we copy to the trampoline
// which have references to the original module. If we can't find the base
// address of the module (e.g. if func_address is in mmap'ed memory), just
- // use func_address as is.
+ // stay within 2GB of func_address.
HMODULE module;
if (::GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
@@ -405,19 +458,32 @@ static uptr AllocateMemoryForTrampoline(uptr func_address, size_t size) {
MODULEINFO module_info;
if (::GetModuleInformation(::GetCurrentProcess(), module,
&module_info, sizeof(module_info))) {
- image_address = (uptr)module_info.lpBaseOfDll;
+ min_addr = (uptr)module_info.lpBaseOfDll + module_info.SizeOfImage -
+ kTrampolineRangeLimit;
+ max_addr = (uptr)module_info.lpBaseOfDll + kTrampolineRangeLimit - size;
}
}
-#endif
- // Find a region within 2G with enough space to allocate |size| bytes.
+ // Check for overflow.
+ if (min_addr > func_address)
+ min_addr = 0;
+ if (max_addr < func_address)
+ max_addr = ~(uptr)0;
+# else
+ uptr min_addr = 0;
+ uptr max_addr = ~min_addr;
+# endif
+
+ // Find a region within [min_addr,max_addr] with enough space to allocate
+ // |size| bytes.
TrampolineMemoryRegion *region = nullptr;
for (size_t bucket = 0; bucket < kMaxTrampolineRegion; ++bucket) {
TrampolineMemoryRegion* current = &TrampolineRegions[bucket];
if (current->content == 0) {
// No valid region found, allocate a new region.
size_t bucket_size = GetMmapGranularity();
- void *content = AllocateTrampolineRegion(image_address, bucket_size);
+ void *content = AllocateTrampolineRegion(min_addr, max_addr, func_address,
+ bucket_size);
if (content == nullptr)
return 0U;
@@ -427,13 +493,9 @@ static uptr AllocateMemoryForTrampoline(uptr func_address, size_t size) {
region = current;
break;
} else if (current->max_size - current->allocated_size > size) {
-#if SANITIZER_WINDOWS64
- // In 64-bits, the memory space must be allocated within 2G boundary.
- uptr next_address = current->content + current->allocated_size;
- if (next_address < image_address ||
- next_address - image_address >= 0x7FFF0000)
- continue;
-#endif
+ uptr next_address = current->content + current->allocated_size;
+ if (next_address < min_addr || next_address > max_addr)
+ continue;
// The space can be allocated in the current region.
region = current;
break;
@@ -872,8 +934,14 @@ static bool CopyInstructions(uptr to, uptr from, size_t size) {
// this will be untrue if relocated_offset \notin [-2**31, 2**31)
s64 delta = to - from;
s64 relocated_offset = *(s32 *)(to + cursor + rel_offset) - delta;
- if (-0x8000'0000ll > relocated_offset || relocated_offset > 0x7FFF'FFFFll)
+ if (-0x8000'0000ll > relocated_offset ||
+ relocated_offset > 0x7FFF'FFFFll) {
+ ReportError(
+ "interception_win: CopyInstructions relocated_offset %lld outside "
+ "32-bit range\n",
+ (long long)relocated_offset);
return false;
+ }
# else
// on 32-bit, the relative offset will always be correct
s32 delta = to - from;
@@ -1167,19 +1235,27 @@ uptr InternalGetProcAddress(void *module, const char *func_name) {
// exported directory.
char function_name[256];
size_t funtion_name_length = _strlen(func);
- if (funtion_name_length >= sizeof(function_name) - 1)
+ if (funtion_name_length >= sizeof(function_name) - 1) {
+ ReportError("interception_win: func too long: '%s'\n", func);
InterceptionFailed();
+ }
_memcpy(function_name, func, funtion_name_length);
function_name[funtion_name_length] = '\0';
char* separator = _strchr(function_name, '.');
- if (!separator)
+ if (!separator) {
+ ReportError("interception_win: no separator in '%s'\n",
+ function_name);
InterceptionFailed();
+ }
*separator = '\0';
void* redirected_module = GetModuleHandleA(function_name);
- if (!redirected_module)
+ if (!redirected_module) {
+ ReportError("interception_win: GetModuleHandleA failed for '%s'\n",
+ function_name);
InterceptionFailed();
+ }
return InternalGetProcAddress(redirected_module, separator + 1);
}
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index e2c06d5..613cfb6 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -8,7 +8,7 @@
#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
(defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
- defined(_AIX) || defined(__wasm__)
+ defined(_AIX) || defined(__wasm__) || defined(__HAIKU__)
#if !defined(_AIX) && !defined(__wasm__)
#include <elf.h>
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c
index 52e8227..29e570b 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c
@@ -9,7 +9,7 @@
#if !defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \
!defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \
!defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \
- !defined(__wasm__)
+ !defined(__wasm__) && !defined(__HAIKU__)
#include <stdlib.h>
#include <stdio.h>
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index b02be0b..5524197 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -15,12 +15,10 @@
#include "interception/interception.h"
#include "sanitizer_common/sanitizer_allocator_dlsym.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_platform_interceptors.h"
#include "interception/interception.h"
#include "rtsan/rtsan.h"
-#include "rtsan/rtsan_context.h"
#if SANITIZER_APPLE
@@ -33,11 +31,11 @@ extern "C" {
typedef int32_t OSSpinLock;
void OSSpinLockLock(volatile OSSpinLock *__lock);
}
-#endif
+#endif // TARGET_OS_MAC
#include <libkern/OSAtomic.h>
#include <os/lock.h>
-#endif
+#endif // SANITIZER_APPLE
#if SANITIZER_INTERCEPT_MEMALIGN || SANITIZER_INTERCEPT_PVALLOC
#include <malloc.h>
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index 8ed933f..1850f4f 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -28,11 +28,6 @@
#include <malloc.h>
#endif
-#include <atomic>
-#include <chrono>
-#include <string>
-#include <thread>
-
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
@@ -41,23 +36,10 @@
#include <sys/uio.h>
#if _FILE_OFFSET_BITS == 64 && SANITIZER_GLIBC
-const char *const kCreatFunctionName = "creat64";
-const char *const kFcntlFunctionName = "fcntl64";
-const char *const kFopenFunctionName = "fopen64";
-const char *const kOpenAtFunctionName = "openat64";
-const char *const kOpenFunctionName = "open64";
-const char *const kPreadFunctionName = "pread64";
-const char *const kPwriteFunctionName = "pwrite64";
-const char *const kMmapFunctionName = "mmap64";
+// Under these conditions, some system calls are `foo64` instead of `foo`
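+// (e.g. MAYBE_APPEND_64("open") expands to "open64" here and to plain "open"
+// in the #else branch below)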
+#define MAYBE_APPEND_64(func) func "64"
#else
-const char *const kCreatFunctionName = "creat";
-const char *const kFcntlFunctionName = "fcntl";
-const char *const kFopenFunctionName = "fopen";
-const char *const kOpenAtFunctionName = "openat";
-const char *const kOpenFunctionName = "open";
-const char *const kPreadFunctionName = "pread";
-const char *const kPwriteFunctionName = "pwrite";
-const char *const kMmapFunctionName = "mmap";
+#define MAYBE_APPEND_64(func) func
#endif
using namespace testing;
@@ -187,7 +169,7 @@ TEST(TestRtsanInterceptors, MmapDiesWhenRealtime) {
void *_ = mmap(nullptr, 8, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
};
- ExpectRealtimeDeath(Func, kMmapFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("mmap"));
ExpectNonRealtimeSurvival(Func);
}
@@ -244,13 +226,13 @@ TEST(TestRtsanInterceptors, NanosleepDiesWhenRealtime) {
TEST_F(RtsanFileTest, OpenDiesWhenRealtime) {
auto Func = [this]() { open(GetTemporaryFilePath(), O_RDONLY); };
- ExpectRealtimeDeath(Func, kOpenFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("open"));
ExpectNonRealtimeSurvival(Func);
}
TEST_F(RtsanFileTest, OpenatDiesWhenRealtime) {
auto Func = [this]() { openat(0, GetTemporaryFilePath(), O_RDONLY); };
- ExpectRealtimeDeath(Func, kOpenAtFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("openat"));
ExpectNonRealtimeSurvival(Func);
}
@@ -275,13 +257,13 @@ TEST_F(RtsanFileTest, OpenCreatesFileWithProperMode) {
TEST_F(RtsanFileTest, CreatDiesWhenRealtime) {
auto Func = [this]() { creat(GetTemporaryFilePath(), S_IWOTH | S_IROTH); };
- ExpectRealtimeDeath(Func, kCreatFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("creat"));
ExpectNonRealtimeSurvival(Func);
}
TEST(TestRtsanInterceptors, FcntlDiesWhenRealtime) {
auto Func = []() { fcntl(0, F_GETFL); };
- ExpectRealtimeDeath(Func, kFcntlFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fcntl"));
ExpectNonRealtimeSurvival(Func);
}
@@ -300,7 +282,7 @@ TEST_F(RtsanFileTest, FcntlFlockDiesWhenRealtime) {
ASSERT_THAT(fcntl(fd, F_GETLK, &lock), Eq(0));
ASSERT_THAT(lock.l_type, F_UNLCK);
};
- ExpectRealtimeDeath(Func, kFcntlFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fcntl"));
ExpectNonRealtimeSurvival(Func);
close(fd);
@@ -322,7 +304,7 @@ TEST_F(RtsanFileTest, FcntlSetFdDiesWhenRealtime) {
ASSERT_THAT(fcntl(fd, F_GETFD), Eq(old_flags));
};
- ExpectRealtimeDeath(Func, kFcntlFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fcntl"));
ExpectNonRealtimeSurvival(Func);
close(fd);
@@ -340,7 +322,7 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) {
EXPECT_THAT(f, Ne(nullptr));
};
- ExpectRealtimeDeath(Func, kFopenFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("fopen"));
ExpectNonRealtimeSurvival(Func);
}
@@ -428,7 +410,7 @@ TEST_F(RtsanOpenedFileTest, PreadDiesWhenRealtime) {
char c{};
pread(GetOpenFd(), &c, 1, 0);
};
- ExpectRealtimeDeath(Func, kPreadFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("pread"));
ExpectNonRealtimeSurvival(Func);
}
@@ -447,7 +429,7 @@ TEST_F(RtsanOpenedFileTest, PwriteDiesWhenRealtime) {
char c = 'a';
pwrite(GetOpenFd(), &c, 1, 0);
};
- ExpectRealtimeDeath(Func, kPwriteFunctionName);
+ ExpectRealtimeDeath(Func, MAYBE_APPEND_64("pwrite"));
ExpectNonRealtimeSurvival(Func);
}
diff --git a/compiler-rt/test/builtins/Unit/ctor_dtor.c b/compiler-rt/test/builtins/Unit/ctor_dtor.c
index 4756072..58dffba 100644
--- a/compiler-rt/test/builtins/Unit/ctor_dtor.c
+++ b/compiler-rt/test/builtins/Unit/ctor_dtor.c
@@ -1,7 +1,7 @@
// REQUIRES: crt
// RUN: %clang -fno-use-init-array -g -c %s -o %t.o
-// RUN: %clang -o %t -no-pie -nostdlib %crt1 %crti %crtbegin %t.o -lc %libgcc %crtend %crtn
+// RUN: %clang -o %t -no-pie -nostdlib %crt1 %crti %crtbegin %t.o %libc %libgcc %crtend %crtn
// RUN: %run %t 2>&1 | FileCheck %s
#include <stdio.h>
diff --git a/compiler-rt/test/builtins/Unit/dso_handle.cpp b/compiler-rt/test/builtins/Unit/dso_handle.cpp
index 7967469..183e29b 100644
--- a/compiler-rt/test/builtins/Unit/dso_handle.cpp
+++ b/compiler-rt/test/builtins/Unit/dso_handle.cpp
@@ -2,8 +2,8 @@
// RUN: %clangxx -g -fno-exceptions -DCRT_SHARED -c %s -fPIC -o %tshared.o
// RUN: %clangxx -g -fno-exceptions -c %s -fPIC -o %t.o
-// RUN: %clangxx -g -shared -o %t.so -nostdlib %crti %crtbegin %tshared.o %libstdcxx -lc -lm %libgcc %crtend %crtn
-// RUN: %clangxx -g -o %t -fno-pic -no-pie -nostdlib %crt1 %crti %crtbegin %t.o %libstdcxx -lc -lm %libgcc %t.so %crtend %crtn
+// RUN: %clangxx -g -shared -o %t.so -nostdlib %crti %crtbegin %tshared.o %libstdcxx %libc -lm %libgcc %crtend %crtn
+// RUN: %clangxx -g -o %t -fno-pic -no-pie -nostdlib %crt1 %crti %crtbegin %t.o %libstdcxx %libc -lm %libgcc %t.so %crtend %crtn
// RUN: %run %t 2>&1 | FileCheck %s
// UNSUPPORTED: target={{(arm|aarch64).*}}
diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py
index c18c973..c030f89 100644
--- a/compiler-rt/test/builtins/Unit/lit.cfg.py
+++ b/compiler-rt/test/builtins/Unit/lit.cfg.py
@@ -104,7 +104,10 @@ else:
if sys.platform in ["win32"] and execute_external:
# Don't pass dosish path separator to msys bash.exe.
base_lib = base_lib.replace("\\", "/")
- config.substitutions.append(("%librt ", base_lib + " -lc -lm "))
+ if config.host_os == "Haiku":
+ config.substitutions.append(("%librt ", base_lib + " -lroot "))
+ else:
+ config.substitutions.append(("%librt ", base_lib + " -lc -lm "))
builtins_build_crt = get_required_attr(config, "builtins_build_crt")
if builtins_build_crt:
@@ -123,6 +126,9 @@ if builtins_build_crt:
config.substitutions.append(("%crtn", get_library_path("crtn.o")))
config.substitutions.append(("%libgcc", get_libgcc_file_name()))
+ config.substitutions.append(
+ ("%libc", "-lroot" if sys.platform.startswith("haiku") else "-lc")
+ )
config.substitutions.append(
("%libstdcxx", "-l" + config.sanitizer_cxx_lib.lstrip("lib"))
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 4bbce4c..c6f2774 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -82,6 +82,8 @@ def push_dynamic_library_lookup_path(config, new_path):
dynamic_library_lookup_var = "PATH"
elif platform.system() == "Darwin":
dynamic_library_lookup_var = "DYLD_LIBRARY_PATH"
+ elif platform.system() == "Haiku":
+ dynamic_library_lookup_var = "LIBRARY_PATH"
else:
dynamic_library_lookup_var = "LD_LIBRARY_PATH"
@@ -275,7 +277,6 @@ possibly_dangerous_env_vars = [
"COMPILER_PATH",
"RC_DEBUG_OPTIONS",
"CINDEXTEST_PREAMBLE_FILE",
- "LIBRARY_PATH",
"CPATH",
"C_INCLUDE_PATH",
"CPLUS_INCLUDE_PATH",
diff --git a/compiler-rt/test/profile/Posix/gcov-destructor.c b/compiler-rt/test/profile/Posix/gcov-destructor.c
index bd1e0d2..1f9412f0 100644
--- a/compiler-rt/test/profile/Posix/gcov-destructor.c
+++ b/compiler-rt/test/profile/Posix/gcov-destructor.c
@@ -1,4 +1,5 @@
/// Test that destructors and destructors whose priorities are greater than 100 are tracked.
+// XFAIL: target={{.*haiku.*}}
// RUN: mkdir -p %t.dir && cd %t.dir
// RUN: %clang --coverage %s -o %t -dumpdir ./
// RUN: rm -f gcov-destructor.gcda && %run %t
diff --git a/compiler-rt/test/profile/Posix/gcov-dlopen.c b/compiler-rt/test/profile/Posix/gcov-dlopen.c
index ceac6ac..72f1118 100644
--- a/compiler-rt/test/profile/Posix/gcov-dlopen.c
+++ b/compiler-rt/test/profile/Posix/gcov-dlopen.c
@@ -1,5 +1,6 @@
/// atexit(3) not supported in dlopen(3)ed+dlclose(3)d DSO
// XFAIL: target={{.*netbsd.*}}
+// XFAIL: target={{.*haiku.*}}
// RUN: mkdir -p %t.d && cd %t.d
diff --git a/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test b/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test
index 0d75018..1c98969 100644
--- a/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test
+++ b/compiler-rt/test/profile/Posix/instrprof-dlopen-norpath.test
@@ -1,3 +1,4 @@
+XFAIL: target={{.*haiku.*}}
RUN: rm -rf %t && split-file %s %t && cd %t
RUN: %clang_pgogen -fprofile-update=atomic -fPIC foo.c -c -Xclang -fprofile-instrument-path="default_foo_%m.profraw"
RUN: %clang_pgogen -fprofile-update=atomic -fPIC foo2.c -c -Xclang -fprofile-instrument-path="default_foo2_%m.profraw"
diff --git a/compiler-rt/test/profile/instrprof-error.c b/compiler-rt/test/profile/instrprof-error.c
index 3297c9d8..a49d238 100644
--- a/compiler-rt/test/profile/instrprof-error.c
+++ b/compiler-rt/test/profile/instrprof-error.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*haiku.*}}
// RUN: %clang_profgen -o %t -O3 %s
// RUN: env LLVM_PROFILE_FILE=%t/ %run %t 1 2>&1 | FileCheck %s
diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py
index ca6df08..bb5e28d 100644
--- a/compiler-rt/test/profile/lit.cfg.py
+++ b/compiler-rt/test/profile/lit.cfg.py
@@ -162,6 +162,7 @@ if config.host_os not in [
"NetBSD",
"SunOS",
"AIX",
+ "Haiku",
]:
config.unsupported = True
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 2f66aad..d2c5b45 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3447,7 +3447,8 @@ WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
// MUTEXINOUTSET | DEPOBJ | // since 5.0
// INOUTSET // since 5.2
struct OmpTaskDependenceType {
- ENUM_CLASS(Type, In, Out, Inout, Source, Sink, Depobj)
+ ENUM_CLASS(
+ Type, In, Out, Inout, Inoutset, Mutexinoutset, Source, Sink, Depobj)
WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Type);
};
diff --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h
index 51d6b8d..4ac2528 100644
--- a/flang/include/flang/Runtime/CUDA/memory.h
+++ b/flang/include/flang/Runtime/CUDA/memory.h
@@ -35,11 +35,6 @@ void RTDECL(CUFMemsetDescriptor)(Descriptor *desc, void *value,
void RTDECL(CUFDataTransferPtrPtr)(void *dst, void *src, std::size_t bytes,
unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0);
-/// Data transfer from a pointer to a descriptor.
-void RTDECL(CUFDataTransferDescPtr)(Descriptor *dst, void *src,
- std::size_t bytes, unsigned mode, const char *sourceFile = nullptr,
- int sourceLine = 0);
-
/// Data transfer from a descriptor to a pointer.
void RTDECL(CUFDataTransferPtrDesc)(void *dst, Descriptor *src,
std::size_t bytes, unsigned mode, const char *sourceFile = nullptr,
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index 421a44b..88514b1 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -179,7 +179,11 @@ static inline void genOmpAccAtomicWriteStatement(
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
mlir::Type varType = fir::unwrapRefType(lhsAddr.getType());
+ // Create a conversion outside the capture block.
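+  // (emit the fir.convert right after the RHS definition so it is not
+  // created inside the capture block)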
+ auto insertionPoint = firOpBuilder.saveInsertionPoint();
+ firOpBuilder.setInsertionPointAfter(rhsExpr.getDefiningOp());
rhsExpr = firOpBuilder.createConvert(loc, varType, rhsExpr);
+ firOpBuilder.restoreInsertionPoint(insertionPoint);
processOmpAtomicTODO<AtomicListT>(varType, loc);
@@ -410,10 +414,6 @@ void genOmpAccAtomicRead(Fortran::lower::AbstractConverter &converter,
fir::getBase(converter.genExprAddr(fromExpr, stmtCtx));
mlir::Value toAddress = fir::getBase(converter.genExprAddr(
*Fortran::semantics::GetExpr(assignmentStmtVariable), stmtCtx));
- fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- if (fromAddress.getType() != toAddress.getType())
- fromAddress =
- builder.create<fir::ConvertOp>(loc, toAddress.getType(), fromAddress);
genOmpAccAtomicCaptureStatement(converter, fromAddress, toAddress,
leftHandClauseList, rightHandClauseList,
elementType, loc);
@@ -497,23 +497,12 @@ void genOmpAccAtomicCapture(Fortran::lower::AbstractConverter &converter,
// a `atomic.read`, `atomic.write`, or `atomic.update` operation
// inside `atomic.capture`
Fortran::lower::StatementContext stmtCtx;
- mlir::Value stmt1LHSArg, stmt1RHSArg, stmt2LHSArg, stmt2RHSArg;
- mlir::Type elementType;
// LHS evaluations are common to all combinations of `atomic.capture`
- stmt1LHSArg = fir::getBase(converter.genExprAddr(assign1.lhs, stmtCtx));
- stmt2LHSArg = fir::getBase(converter.genExprAddr(assign2.lhs, stmtCtx));
+ mlir::Value stmt1LHSArg =
+ fir::getBase(converter.genExprAddr(assign1.lhs, stmtCtx));
+ mlir::Value stmt2LHSArg =
+ fir::getBase(converter.genExprAddr(assign2.lhs, stmtCtx));
- // Operation specific RHS evaluations
- if (Fortran::semantics::checkForSingleVariableOnRHS(stmt1)) {
- // Atomic capture construct is of the form [capture-stmt, update-stmt] or
- // of the form [capture-stmt, write-stmt]
- stmt1RHSArg = fir::getBase(converter.genExprAddr(assign1.rhs, stmtCtx));
- stmt2RHSArg = fir::getBase(converter.genExprValue(assign2.rhs, stmtCtx));
- } else {
- // Atomic capture construct is of the form [update-stmt, capture-stmt]
- stmt1RHSArg = fir::getBase(converter.genExprValue(assign1.rhs, stmtCtx));
- stmt2RHSArg = fir::getBase(converter.genExprAddr(assign2.lhs, stmtCtx));
- }
// Type information used in generation of `atomic.update` operation
mlir::Type stmt1VarType =
fir::getBase(converter.genExprValue(assign1.lhs, stmtCtx)).getType();
@@ -545,44 +534,46 @@ void genOmpAccAtomicCapture(Fortran::lower::AbstractConverter &converter,
// Atomic capture construct is of the form [capture-stmt, update-stmt]
const Fortran::semantics::SomeExpr &fromExpr =
*Fortran::semantics::GetExpr(stmt1Expr);
- elementType = converter.genType(fromExpr);
+ mlir::Type elementType = converter.genType(fromExpr);
genOmpAccAtomicCaptureStatement<AtomicListT>(
- converter, stmt1RHSArg, stmt1LHSArg,
+ converter, stmt2LHSArg, stmt1LHSArg,
/*leftHandClauseList=*/nullptr,
/*rightHandClauseList=*/nullptr, elementType, loc);
genOmpAccAtomicUpdateStatement<AtomicListT>(
- converter, stmt1RHSArg, stmt2VarType, stmt2Var, stmt2Expr,
+ converter, stmt2LHSArg, stmt2VarType, stmt2Var, stmt2Expr,
/*leftHandClauseList=*/nullptr,
/*rightHandClauseList=*/nullptr, loc, atomicCaptureOp);
} else {
// Atomic capture construct is of the form [capture-stmt, write-stmt]
+ firOpBuilder.setInsertionPoint(atomicCaptureOp);
+ mlir::Value stmt2RHSArg =
+ fir::getBase(converter.genExprValue(assign2.rhs, stmtCtx));
+ firOpBuilder.setInsertionPointToStart(&block);
const Fortran::semantics::SomeExpr &fromExpr =
*Fortran::semantics::GetExpr(stmt1Expr);
- elementType = converter.genType(fromExpr);
+ mlir::Type elementType = converter.genType(fromExpr);
genOmpAccAtomicCaptureStatement<AtomicListT>(
- converter, stmt1RHSArg, stmt1LHSArg,
+ converter, stmt2LHSArg, stmt1LHSArg,
/*leftHandClauseList=*/nullptr,
/*rightHandClauseList=*/nullptr, elementType, loc);
genOmpAccAtomicWriteStatement<AtomicListT>(
- converter, stmt1RHSArg, stmt2RHSArg,
+ converter, stmt2LHSArg, stmt2RHSArg,
/*leftHandClauseList=*/nullptr,
/*rightHandClauseList=*/nullptr, loc);
}
} else {
// Atomic capture construct is of the form [update-stmt, capture-stmt]
- firOpBuilder.setInsertionPointToEnd(&block);
const Fortran::semantics::SomeExpr &fromExpr =
*Fortran::semantics::GetExpr(stmt2Expr);
- elementType = converter.genType(fromExpr);
- genOmpAccAtomicCaptureStatement<AtomicListT>(
- converter, stmt1LHSArg, stmt2LHSArg,
- /*leftHandClauseList=*/nullptr,
- /*rightHandClauseList=*/nullptr, elementType, loc);
- firOpBuilder.setInsertionPointToStart(&block);
+ mlir::Type elementType = converter.genType(fromExpr);
genOmpAccAtomicUpdateStatement<AtomicListT>(
converter, stmt1LHSArg, stmt1VarType, stmt1Var, stmt1Expr,
/*leftHandClauseList=*/nullptr,
/*rightHandClauseList=*/nullptr, loc, atomicCaptureOp);
+ genOmpAccAtomicCaptureStatement<AtomicListT>(
+ converter, stmt1LHSArg, stmt2LHSArg,
+ /*leftHandClauseList=*/nullptr,
+ /*rightHandClauseList=*/nullptr, elementType, loc);
}
firOpBuilder.setInsertionPointToEnd(&block);
if constexpr (std::is_same<AtomicListT,
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 213b650..e768c1c 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -121,8 +121,11 @@ genProcBindKindAttr(fir::FirOpBuilder &firOpBuilder,
}
static mlir::omp::ClauseTaskDependAttr
-genDependKindAttr(fir::FirOpBuilder &firOpBuilder,
+genDependKindAttr(lower::AbstractConverter &converter,
const omp::clause::Depend::TaskDependenceType kind) {
+ fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+ mlir::Location currentLocation = converter.getCurrentLocation();
+
mlir::omp::ClauseTaskDepend pbKind;
switch (kind) {
case omp::clause::Depend::TaskDependenceType::In:
@@ -136,6 +139,8 @@ genDependKindAttr(fir::FirOpBuilder &firOpBuilder,
break;
case omp::clause::Depend::TaskDependenceType::Mutexinoutset:
case omp::clause::Depend::TaskDependenceType::Inoutset:
+ TODO(currentLocation, "INOUTSET and MUTEXINOUTSET are not supported yet");
+ break;
case omp::clause::Depend::TaskDependenceType::Depobj:
case omp::clause::Depend::TaskDependenceType::Sink:
case omp::clause::Depend::TaskDependenceType::Source:
@@ -795,8 +800,6 @@ bool ClauseProcessor::processCopyprivate(
}
bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const {
- fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-
auto process = [&](const omp::clause::Depend &clause,
const parser::CharBlock &) {
using Depend = omp::clause::Depend;
@@ -813,7 +816,7 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const {
"Support for iterator modifiers is not implemented yet");
}
mlir::omp::ClauseTaskDependAttr dependTypeOperand =
- genDependKindAttr(firOpBuilder, kind);
+ genDependKindAttr(converter, kind);
result.dependKinds.append(objects.size(), dependTypeOperand);
for (const omp::Object &object : objects) {
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 936d0d2..46caafe 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -347,8 +347,10 @@ makeDepType(const parser::OmpTaskDependenceType &inp) {
return clause::TaskDependenceType::In;
case parser::OmpTaskDependenceType::Type::Inout:
return clause::TaskDependenceType::Inout;
- // Inoutset // missing-in-parser
- // Mutexinoutset // missing-in-parser
+ case parser::OmpTaskDependenceType::Type::Inoutset:
+ return clause::TaskDependenceType::Inoutset;
+ case parser::OmpTaskDependenceType::Type::Mutexinoutset:
+ return clause::TaskDependenceType::Mutexinoutset;
case parser::OmpTaskDependenceType::Type::Out:
return clause::TaskDependenceType::Out;
case parser::OmpTaskDependenceType::Type::Sink:
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index 8b79187..993d416 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -372,6 +372,29 @@ getAttrsFromVariable(fir::FortranVariableOpInterface var) {
return attrs;
}
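+// If |declOp| declares one of |op|'s private block arguments, return the
+// memref of the declare op yielded by the matching omp.private region;
+// otherwise return a null Value.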
+template <typename OMPTypeOp, typename DeclTypeOp>
+static Value getPrivateArg(omp::BlockArgOpenMPOpInterface &argIface,
+ OMPTypeOp &op, DeclTypeOp &declOp) {
+ Value privateArg;
+ if (!op.getPrivateSyms().has_value())
+ return privateArg;
+ for (auto [opSym, blockArg] :
+ llvm::zip_equal(*op.getPrivateSyms(), argIface.getPrivateBlockArgs())) {
+ if (blockArg == declOp.getMemref()) {
+ omp::PrivateClauseOp privateOp =
+ SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+ op, cast<SymbolRefAttr>(opSym));
+ privateOp.walk([&](omp::YieldOp yieldOp) {
+ llvm::TypeSwitch<Operation *>(yieldOp.getResults()[0].getDefiningOp())
+ .template Case<fir::DeclareOp, hlfir::DeclareOp>(
+ [&](auto declOp) { privateArg = declOp.getMemref(); });
+ });
+ return privateArg;
+ }
+ }
+ return privateArg;
+}
+
AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
bool getInstantiationPoint) {
auto *defOp = v.getDefiningOp();
@@ -470,20 +493,37 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
breakFromLoop = true;
})
.Case<hlfir::DeclareOp, fir::DeclareOp>([&](auto op) {
- // If declare operation is inside omp target region,
- // continue alias analysis outside the target region
- if (auto targetOp =
- llvm::dyn_cast<omp::TargetOp>(op->getParentOp())) {
- auto argIface = cast<omp::BlockArgOpenMPOpInterface>(*targetOp);
- for (auto [opArg, blockArg] : llvm::zip_equal(
- targetOp.getMapVars(), argIface.getMapBlockArgs())) {
- if (blockArg == op.getMemref()) {
- omp::MapInfoOp mapInfo =
- llvm::cast<omp::MapInfoOp>(opArg.getDefiningOp());
- v = mapInfo.getVarPtr();
- defOp = v.getDefiningOp();
- return;
- }
+ if (omp::BlockArgOpenMPOpInterface argIface =
+ dyn_cast<omp::BlockArgOpenMPOpInterface>(op->getParentOp())) {
+ Value ompValArg;
+ llvm::TypeSwitch<Operation *>(op->getParentOp())
+ .template Case<omp::TargetOp>([&](auto targetOp) {
+ // If declare operation is inside omp target region,
+ // continue alias analysis outside the target region
+ for (auto [opArg, blockArg] : llvm::zip_equal(
+ targetOp.getMapVars(), argIface.getMapBlockArgs())) {
+ if (blockArg == op.getMemref()) {
+ omp::MapInfoOp mapInfo =
+ llvm::cast<omp::MapInfoOp>(opArg.getDefiningOp());
+ ompValArg = mapInfo.getVarPtr();
+ break;
+ }
+ }
+              // If the given operation does not correspond to a mapped item,
+              // fall back to checking the private clause.
+ if (!ompValArg)
+ ompValArg = getPrivateArg(argIface, targetOp, op);
+ })
+ .template Case<omp::DistributeOp, omp::ParallelOp,
+ omp::SectionsOp, omp::SimdOp, omp::SingleOp,
+ omp::TaskloopOp, omp::TaskOp, omp::WsloopOp>(
+ [&](auto privateOp) {
+ ompValArg = getPrivateArg(argIface, privateOp, op);
+ });
+ if (ompValArg) {
+ v = ompValArg;
+ defOp = ompValArg.getDefiningOp();
+ return;
}
}
auto varIf = llvm::cast<fir::FortranVariableOpInterface>(defOp);
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 3c139f7..a914407 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -264,10 +264,10 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
addNestedPassToAllTopLevelOperations(pm, fir::createAbstractResultOpt);
fir::addCodeGenRewritePass(
pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo));
- fir::addTargetRewritePass(pm);
- fir::addCompilerGeneratedNamesConversionPass(pm);
fir::addExternalNameConversionPass(pm, config.Underscoring);
fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename);
+ fir::addTargetRewritePass(pm);
+ fir::addCompilerGeneratedNamesConversionPass(pm);
if (config.VScaleMin != 0)
pm.addPass(fir::createVScaleAttr({{config.VScaleMin, config.VScaleMax}}));
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index a28d0a5..89d0af1 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -23,6 +23,7 @@
#include "flang/Runtime/allocatable.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -439,6 +440,14 @@ static bool isDstGlobal(cuf::DataTransferOp op) {
return false;
}
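+// Return the shape operand of the fir/hlfir declare op defining |src|, or a
+// null Value if there is none.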
+static mlir::Value getShapeFromDecl(mlir::Value src) {
+ if (auto declareOp = src.getDefiningOp<fir::DeclareOp>())
+ return declareOp.getShape();
+ if (auto declareOp = src.getDefiningOp<hlfir::DeclareOp>())
+ return declareOp.getShape();
+ return mlir::Value{};
+}
+
struct CUFDataTransferOpConversion
: public mlir::OpRewritePattern<cuf::DataTransferOp> {
using OpRewritePattern::OpRewritePattern;
@@ -528,54 +537,54 @@ struct CUFDataTransferOpConversion
}
// Conversion of data transfer involving at least one descriptor.
- if (mlir::isa<fir::BaseBoxType>(srcTy) &&
- mlir::isa<fir::BaseBoxType>(dstTy)) {
- // Transfer between two descriptor.
+ if (mlir::isa<fir::BaseBoxType>(dstTy)) {
+ // Transfer to a descriptor.
mlir::func::FuncOp func =
isDstGlobal(op)
? fir::runtime::getRuntimeFunc<mkRTKey(
CUFDataTransferGlobalDescDesc)>(loc, builder)
: fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferDescDesc)>(
loc, builder);
-
- auto fTy = func.getFunctionType();
- mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
- mlir::Value sourceLine =
- fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
mlir::Value dst = op.getDst();
mlir::Value src = op.getSrc();
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, dst, src, modeValue, sourceFile, sourceLine)};
- builder.create<fir::CallOp>(loc, func, args);
- rewriter.eraseOp(op);
- } else if (mlir::isa<fir::BaseBoxType>(dstTy) && fir::isa_trivial(srcTy)) {
- // Scalar to descriptor transfer.
- mlir::Value val = op.getSrc();
- if (op.getSrc().getDefiningOp() &&
- mlir::isa<mlir::arith::ConstantOp>(op.getSrc().getDefiningOp())) {
- mlir::Value alloc = builder.createTemporary(loc, srcTy);
- builder.create<fir::StoreOp>(loc, op.getSrc(), alloc);
- val = alloc;
+
+ if (!mlir::isa<fir::BaseBoxType>(srcTy)) {
+ // If src is not a descriptor, create one.
+ mlir::Value addr;
+ if (fir::isa_trivial(srcTy) &&
+ mlir::matchPattern(op.getSrc().getDefiningOp(),
+ mlir::m_Constant())) {
+          // Put the constant into memory if it is not already there.
+ mlir::Value alloc = builder.createTemporary(loc, srcTy);
+ builder.create<fir::StoreOp>(loc, op.getSrc(), alloc);
+ addr = alloc;
+ } else {
+ addr = getDeviceAddress(rewriter, op.getSrcMutable(), symtab);
+ }
+ mlir::Type boxTy = fir::BoxType::get(srcTy);
+ llvm::SmallVector<mlir::Value> lenParams;
+ mlir::Value box =
+ builder.createBox(loc, boxTy, addr, getShapeFromDecl(src),
+ /*slice=*/nullptr, lenParams,
+ /*tdesc=*/nullptr);
+ mlir::Value memBox = builder.createTemporary(loc, box.getType());
+ builder.create<fir::StoreOp>(loc, box, memBox);
+ src = memBox;
}
- mlir::func::FuncOp func =
- fir::runtime::getRuntimeFunc<mkRTKey(CUFMemsetDescriptor)>(loc,
- builder);
auto fTy = func.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
- fir::factory::locationToLineNo(builder, loc, fTy.getInput(3));
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, op.getDst(), val, sourceFile, sourceLine)};
+ builder, loc, fTy, dst, src, modeValue, sourceFile, sourceLine)};
builder.create<fir::CallOp>(loc, func, args);
rewriter.eraseOp(op);
} else {
// Type used to compute the width.
mlir::Type computeType = dstTy;
auto seqTy = mlir::dyn_cast<fir::SequenceType>(dstTy);
- bool dstIsDesc = false;
if (mlir::isa<fir::BaseBoxType>(dstTy)) {
- dstIsDesc = true;
computeType = srcTy;
seqTy = mlir::dyn_cast<fir::SequenceType>(srcTy);
}
@@ -606,11 +615,8 @@ struct CUFDataTransferOpConversion
rewriter.create<mlir::arith::MulIOp>(loc, nbElement, widthValue);
mlir::func::FuncOp func =
- dstIsDesc
- ? fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferDescPtr)>(
- loc, builder)
- : fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferPtrDesc)>(
- loc, builder);
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFDataTransferPtrDesc)>(
+ loc, builder);
auto fTy = func.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 0510b32..7a0ecc5 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -402,7 +402,9 @@ TYPE_PARSER(
TYPE_PARSER(construct<OmpTaskDependenceType>(
"DEPOBJ" >> pure(OmpTaskDependenceType::Type::Depobj) ||
"IN"_id >> pure(OmpTaskDependenceType::Type::In) ||
- "INOUT" >> pure(OmpTaskDependenceType::Type::Inout) ||
+ "INOUT"_id >> pure(OmpTaskDependenceType::Type::Inout) ||
+ "INOUTSET"_id >> pure(OmpTaskDependenceType::Type::Inoutset) ||
+ "MUTEXINOUTSET" >> pure(OmpTaskDependenceType::Type::Mutexinoutset) ||
"OUT" >> pure(OmpTaskDependenceType::Type::Out) ||
"SINK" >> pure(OmpTaskDependenceType::Type::Sink) ||
"SOURCE" >> pure(OmpTaskDependenceType::Type::Source)))
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 749d588..cdbda1a 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1732,6 +1732,45 @@ void OmpStructureChecker::CheckTargetUpdate() {
}
}
+void OmpStructureChecker::CheckTaskDependenceType(
+ const parser::OmpTaskDependenceType::Type &x) {
+ // Common checks for task-dependence-type (DEPEND and UPDATE clauses).
+ unsigned version{context_.langOptions().OpenMPVersion};
+ unsigned since{0}, deprecatedIn{~0u};
+
+ switch (x) {
+ case parser::OmpTaskDependenceType::Type::In:
+ case parser::OmpTaskDependenceType::Type::Out:
+ case parser::OmpTaskDependenceType::Type::Inout:
+ break;
+ case parser::OmpTaskDependenceType::Type::Source:
+ case parser::OmpTaskDependenceType::Type::Sink:
+ deprecatedIn = 52;
+ break;
+ case parser::OmpTaskDependenceType::Type::Mutexinoutset:
+ case parser::OmpTaskDependenceType::Type::Depobj:
+ since = 50;
+ break;
+ case parser::OmpTaskDependenceType::Type::Inoutset:
+ since = 52;
+ break;
+ }
+
+ if (version >= deprecatedIn) {
+ context_.Say(GetContext().clauseSource,
+ "%s task-dependence-type is deprecated in %s"_warn_en_US,
+ parser::ToUpperCaseLetters(
+ parser::OmpTaskDependenceType::EnumToString(x)),
+ ThisVersion(deprecatedIn));
+ } else if (version < since) {
+ context_.Say(GetContext().clauseSource,
+ "%s task-dependence-type is not supported in %s, %s"_warn_en_US,
+ parser::ToUpperCaseLetters(
+ parser::OmpTaskDependenceType::EnumToString(x)),
+ ThisVersion(version), TryVersion(since));
+ }
+}
+
void OmpStructureChecker::Enter(
const parser::OpenMPSimpleStandaloneConstruct &x) {
const auto &dir{std::get<parser::OmpSimpleStandaloneDirective>(x.t)};
@@ -3393,20 +3432,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) {
using DepType = parser::OmpTaskDependenceType::Type;
DepType depType = x.v.GetDepType();
- if (version >= 52) {
- switch (depType) {
- case DepType::Sink:
- case DepType::Source:
- context_.Say(GetContext().clauseSource,
- "The %s task-dependence-type is deprecated in %s"_warn_en_US,
- parser::ToUpperCaseLetters(
- parser::OmpTaskDependenceType::EnumToString(depType)),
- ThisVersion(version));
- break;
- default:
- break;
- }
- }
+ CheckTaskDependenceType(depType);
if (directive == llvm::omp::OMPD_depobj) {
// [5.0:255:11], [5.1:288:3]
@@ -3593,6 +3619,8 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Update &x) {
llvm::omp::Directive directive{GetContext().directive};
unsigned version{context_.langOptions().OpenMPVersion};
+ CheckTaskDependenceType(x.v.v.v);
+
// [5.1:288:4-5]
// An update clause on a depobj construct must not have source, sink or depobj
// as dependence-type.
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 5e26827..d9236be 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -202,6 +202,7 @@ private:
void CheckSIMDNest(const parser::OpenMPConstruct &x);
void CheckTargetNest(const parser::OpenMPConstruct &x);
void CheckTargetUpdate();
+ void CheckTaskDependenceType(const parser::OmpTaskDependenceType::Type &x);
void CheckCancellationNest(
const parser::CharBlock &source, const parser::OmpCancelType::Type &type);
std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x);
diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp
index 0e03c61..2d499f9 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang/runtime/CUDA/memory.cpp
@@ -96,13 +96,6 @@ void RTDEF(CUFDataTransferPtrPtr)(void *dst, void *src, std::size_t bytes,
CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, bytes, kind));
}
-void RTDEF(CUFDataTransferDescPtr)(Descriptor *desc, void *addr,
- std::size_t bytes, unsigned mode, const char *sourceFile, int sourceLine) {
- Terminator terminator{sourceFile, sourceLine};
- terminator.Crash(
- "not yet implemented: CUDA data transfer from a pointer to a descriptor");
-}
-
void RTDEF(CUFDataTransferPtrDesc)(void *addr, Descriptor *desc,
std::size_t bytes, unsigned mode, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir
new file mode 100644
index 0000000..78207d2
--- /dev/null
+++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private-ptr.mlir
@@ -0,0 +1,102 @@
+// Use --mlir-disable-threading so that the AA queries are serialized
+// as well as its diagnostic output.
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s
+
+// Fortran code:
+// program main
+// integer, target :: arrayA(10)
+// integer, pointer, dimension(:) :: ptrA
+// integer :: i
+// ptrA => arrayA
+// !$omp teams distribute parallel do firstprivate(ptrA)
+// do i = 1, 10
+// arrayA(i) = arrayA(i) + ptrA(i);
+// end do
+// end program main
+
+// CHECK-LABEL: Testing : "_QQmain"
+// CHECK-DAG: ptrA#0 <-> ArrayA#0: MayAlias
+
+omp.private {type = private} @_QFEi_private_ref_i32 : !fir.ref<i32> alloc {
+^bb0(%arg0: !fir.ref<i32>):
+ %0 = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ omp.yield(%1#0 : !fir.ref<i32>)
+}
+omp.private {type = firstprivate} @_QFEptra_firstprivate_ref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+ %0 = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "ptra", pinned, uniq_name = "_QFEptra"}
+ %1:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFEptra"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+ omp.yield(%1#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+} copy {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+ %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ fir.store %0 to %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.yield(%arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+}
+func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+ %0 = fir.address_of(@_QFEarraya) : !fir.ref<!fir.array<10xi32>>
+ %c10 = arith.constant 10 : index
+ %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %2:2 = hlfir.declare %0(%1) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFEarraya"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %3 = fir.address_of(@_QFEarrayb) : !fir.ref<!fir.array<10xi32>>
+ %c10_0 = arith.constant 10 : index
+ %4 = fir.shape %c10_0 : (index) -> !fir.shape<1>
+ %5:2 = hlfir.declare %3(%4) {uniq_name = "_QFEarrayb"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %6 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+ %7:2 = hlfir.declare %6 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %8 = fir.address_of(@_QFEptra) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ %9:2 = hlfir.declare %8 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFEptra"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+ %10 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %11 = fir.embox %2#1(%10) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ fir.store %11 to %9#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.teams {
+ omp.parallel private(@_QFEptra_firstprivate_ref_box_ptr_Uxi32 %9#0 -> %arg0, @_QFEi_private_ref_i32 %7#0 -> %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<i32>) {
+ %12:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFEptra"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+ %13:2 = hlfir.declare %arg1 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ %c1_i32_1 = arith.constant 1 : i32
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_1) {
+ fir.store %arg2 to %13#1 : !fir.ref<i32>
+ %14 = fir.load %13#0 : !fir.ref<i32>
+ %15 = fir.convert %14 : (i32) -> i64
+ %16 = hlfir.designate %2#0 (%15) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ %17 = fir.load %16 : !fir.ref<i32>
+ %18 = fir.load %12#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ %19 = fir.load %13#0 : !fir.ref<i32>
+ %20 = fir.convert %19 : (i32) -> i64
+ %21 = hlfir.designate %18 (%20) {test.ptr = "ptrA" } : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, i64) -> !fir.ref<i32>
+ %22 = fir.load %21 : !fir.ref<i32>
+ %23 = arith.addi %17, %22 : i32
+ %24 = fir.load %13#0 : !fir.ref<i32>
+ %25 = fir.convert %24 : (i32) -> i64
+ %26 = hlfir.designate %2#0 (%25) {test.ptr = "ArrayA"} : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ hlfir.assign %23 to %26 : i32, !fir.ref<i32>
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ omp.terminator
+ }
+ return
+}
+fir.global internal @_QFEarraya target : !fir.array<10xi32> {
+ %0 = fir.zero_bits !fir.array<10xi32>
+ fir.has_value %0 : !fir.array<10xi32>
+}
+fir.global internal @_QFEarrayb : !fir.array<10xi32> {
+ %0 = fir.zero_bits !fir.array<10xi32>
+ fir.has_value %0 : !fir.array<10xi32>
+}
+fir.global internal @_QFEptra : !fir.box<!fir.ptr<!fir.array<?xi32>>> {
+ %0 = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+ %c0 = arith.constant 0 : index
+ %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+ %2 = fir.embox %0(%1) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ fir.has_value %2 : !fir.box<!fir.ptr<!fir.array<?xi32>>>
+}
diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir
new file mode 100644
index 0000000..4668b2c
--- /dev/null
+++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-teams-distribute-private.mlir
@@ -0,0 +1,121 @@
+// Use --mlir-disable-threading so that the AA queries are serialized
+// as well as its diagnostic output.
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s
+
+// Fortran code:
+//
+// program main
+// integer :: arrayA(10,10)
+// integer :: tmp(2)
+// integer :: i,j
+// !$omp teams distribute parallel do private(tmp)
+// do j = 1, 10
+// do i = 1,10
+// tmp = [i,j]
+// arrayA = tmp(1)
+// end do
+// end do
+// end program main
+
+// CHECK-LABEL: Testing : "_QQmain"
+// CHECK-DAG: tmp_private_array#0 <-> unnamed_array#0: NoAlias
+// CHECK-DAG: tmp_private_array#1 <-> unnamed_array#0: NoAlias
+
+omp.private {type = private} @_QFEi_private_ref_i32 : !fir.ref<i32> alloc {
+^bb0(%arg0: !fir.ref<i32>):
+ %0 = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFEi"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ omp.yield(%1#0 : !fir.ref<i32>)
+}
+omp.private {type = private} @_QFEj_private_ref_i32 : !fir.ref<i32> alloc {
+^bb0(%arg0: !fir.ref<i32>):
+ %0 = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFEj"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ omp.yield(%1#0 : !fir.ref<i32>)
+}
+omp.private {type = private} @_QFEtmp_private_ref_2xi32 : !fir.ref<!fir.array<2xi32>> alloc {
+^bb0(%arg0: !fir.ref<!fir.array<2xi32>>):
+ %c2 = arith.constant 2 : index
+ %0 = fir.alloca !fir.array<2xi32> {bindc_name = "tmp", pinned, uniq_name = "_QFEtmp"}
+ %1 = fir.shape %c2 : (index) -> !fir.shape<1>
+ %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFEtmp"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+ omp.yield(%2#0 : !fir.ref<!fir.array<2xi32>>)
+}
+func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+ %0 = fir.address_of(@_QFEarraya) : !fir.ref<!fir.array<10x10xi32>>
+ %c10 = arith.constant 10 : index
+ %c10_0 = arith.constant 10 : index
+ %1 = fir.shape %c10, %c10_0 : (index, index) -> !fir.shape<2>
+ %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFEarraya"} : (!fir.ref<!fir.array<10x10xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<10x10xi32>>, !fir.ref<!fir.array<10x10xi32>>)
+ %3 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+ %4:2 = hlfir.declare %3 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %5 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
+ %6:2 = hlfir.declare %5 {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %c2 = arith.constant 2 : index
+ %7 = fir.alloca !fir.array<2xi32> {bindc_name = "tmp", uniq_name = "_QFEtmp"}
+ %8 = fir.shape %c2 : (index) -> !fir.shape<1>
+ %9:2 = hlfir.declare %7(%8) {uniq_name = "_QFEtmp"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+ omp.teams {
+ omp.parallel private(@_QFEtmp_private_ref_2xi32 %9#0 -> %arg0, @_QFEj_private_ref_i32 %6#0 -> %arg1, @_QFEi_private_ref_i32 %4#0 -> %arg2 : !fir.ref<!fir.array<2xi32>>, !fir.ref<i32>, !fir.ref<i32>) {
+ %c2_1 = arith.constant 2 : index
+ %10 = fir.shape %c2_1 : (index) -> !fir.shape<1>
+ %11:2 = hlfir.declare %arg0(%10) {uniq_name = "_QFEtmp", test.ptr = "tmp_private_array"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+ %12:2 = hlfir.declare %arg1 {uniq_name = "_QFEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %13:2 = hlfir.declare %arg2 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ %c1_i32_2 = arith.constant 1 : i32
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_2) {
+ fir.store %arg3 to %12#1 : !fir.ref<i32>
+ %c1_i32_3 = arith.constant 1 : i32
+ %14 = fir.convert %c1_i32_3 : (i32) -> index
+ %c10_i32_4 = arith.constant 10 : i32
+ %15 = fir.convert %c10_i32_4 : (i32) -> index
+ %c1 = arith.constant 1 : index
+ %16 = fir.convert %14 : (index) -> i32
+ %17:2 = fir.do_loop %arg4 = %14 to %15 step %c1 iter_args(%arg5 = %16) -> (index, i32) {
+ fir.store %arg5 to %13#1 : !fir.ref<i32>
+ %c2_5 = arith.constant 2 : index
+ %c1_6 = arith.constant 1 : index
+ %c1_7 = arith.constant 1 : index
+ %18 = fir.allocmem !fir.array<2xi32> {bindc_name = ".tmp.arrayctor", uniq_name = ""}
+ %19 = fir.shape %c2_5 : (index) -> !fir.shape<1>
+ %20:2 = hlfir.declare %18(%19) {uniq_name = ".tmp.arrayctor"} : (!fir.heap<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<2xi32>>, !fir.heap<!fir.array<2xi32>>)
+ %21 = fir.load %13#0 : !fir.ref<i32>
+ %22 = arith.addi %c1_6, %c1_7 : index
+ %23 = hlfir.designate %20#0 (%c1_6) : (!fir.heap<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+ hlfir.assign %21 to %23 : i32, !fir.ref<i32>
+ %24 = fir.load %12#0 : !fir.ref<i32>
+ %25 = hlfir.designate %20#0 (%22) : (!fir.heap<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+ hlfir.assign %24 to %25 : i32, !fir.ref<i32>
+ %true = arith.constant true
+ %26 = hlfir.as_expr %20#0 move %true {test.ptr = "unnamed_array"} : (!fir.heap<!fir.array<2xi32>>, i1) -> !hlfir.expr<2xi32>
+ hlfir.assign %26 to %11#0 : !hlfir.expr<2xi32>, !fir.ref<!fir.array<2xi32>>
+ hlfir.destroy %26 : !hlfir.expr<2xi32>
+ %c1_8 = arith.constant 1 : index
+ %27 = hlfir.designate %11#0 (%c1_8) : (!fir.ref<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+ %28 = fir.load %27 : !fir.ref<i32>
+ hlfir.assign %28 to %2#0 : i32, !fir.ref<!fir.array<10x10xi32>>
+ %29 = arith.addi %arg4, %c1 : index
+ %30 = fir.convert %c1 : (index) -> i32
+ %31 = fir.load %13#1 : !fir.ref<i32>
+ %32 = arith.addi %31, %30 : i32
+ fir.result %29, %32 : index, i32
+ }
+ fir.store %17#1 to %13#1 : !fir.ref<i32>
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ omp.terminator
+ }
+ return
+}
+fir.global internal @_QFEarraya : !fir.array<10x10xi32> {
+ %0 = fir.zero_bits !fir.array<10x10xi32>
+ fir.has_value %0 : !fir.array<10x10xi32>
+}
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index e44f4e6..ab5dded 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -111,10 +111,10 @@ end program
! ALL-NEXT: CodeGenRewrite
! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated
-! ALL-NEXT: TargetRewrite
-! ALL-NEXT: CompilerGeneratedNamesConversion
! ALL-NEXT: ExternalNameConversion
! DEBUG-NEXT: AddDebugInfo
! NO-DEBUG-NOT: AddDebugInfo
+! ALL-NEXT: TargetRewrite
+! ALL-NEXT: CompilerGeneratedNamesConversion
! ALL: FIRToLLVMLowering
! ALL-NOT: LLVMIRLoweringPass
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 6c2829d..7d57135 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -120,8 +120,8 @@ end program
! ALL-NEXT: CodeGenRewrite
! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated
+! ALL-NEXT: ExternalNameConversion
! ALL-NEXT: TargetRewrite
! ALL-NEXT: CompilerGeneratedNamesConversion
-! ALL-NEXT: ExternalNameConversion
! ALL-NEXT: FIRToLLVMLowering
! ALL-NOT: LLVMIRLoweringPass
diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir
index a760650..6a33190 100644
--- a/flang/test/Fir/CUDA/cuda-data-transfer.fir
+++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir
@@ -29,13 +29,16 @@ func.func @_QPsub2() {
}
// CHECK-LABEL: func.func @_QPsub2()
+// CHECK: %[[TEMP_BOX:.*]] = fir.alloca !fir.box<i32>
// CHECK: %[[TEMP:.*]] = fir.alloca i32
// CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: %[[C2:.*]] = arith.constant 2 : i32
// CHECK: fir.store %[[C2]] to %[[TEMP]] : !fir.ref<i32>
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[TEMP]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<i32>>
// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %[[TEMP_CONV:.*]] = fir.convert %[[TEMP]] : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
-// CHECK: fir.call @_FortranACUFMemsetDescriptor(%[[ADEV_BOX]], %[[TEMP_CONV]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> none
+// CHECK: %[[TEMP_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<i32>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[TEMP_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
func.func @_QPsub3() {
%0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub3Eadev"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -48,12 +51,15 @@ func.func @_QPsub3() {
}
// CHECK-LABEL: func.func @_QPsub3()
+// CHECK: %[[TEMP_BOX:.*]] = fir.alloca !fir.box<i32>
// CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: %[[V:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub3Ev"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[V]]#0 : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<i32>>
// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %[[V_CONV:.*]] = fir.convert %[[V]]#0 : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
-// CHECK: fir.call @_FortranACUFMemsetDescriptor(%[[ADEV_BOX]], %[[V_CONV]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> none
-
+// CHECK: %[[V_CONV:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<i32>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[V_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
+
func.func @_QPsub4() {
%0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub4Eadev"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
%4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
@@ -67,15 +73,14 @@ func.func @_QPsub4() {
return
}
// CHECK-LABEL: func.func @_QPsub4()
+// CHECK: %[[TEMP_BOX:.*]] = fir.alloca !fir.box<!fir.array<10xi32>>
// CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
-// CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub4Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
-// CHECK: %[[NBELEM:.*]] = arith.constant 10 : index
-// CHECK: %[[WIDTH:.*]] = arith.constant 4 : index
-// CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index
+// CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%[[AHOST_SHAPE:.*]]) {uniq_name = "_QFsub4Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[AHOST]]#0(%[[AHOST_SHAPE]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
+// CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<!fir.array<10xi32>>>
// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %[[AHOST_PTR:.*]] = fir.convert %[[AHOST]]#0 : (!fir.ref<!fir.array<10xi32>>) -> !fir.llvm_ptr<i8>
-// CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: fir.call @_FortranACUFDataTransferDescPtr(%[[ADEV_BOX]], %[[AHOST_PTR]], %[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none
+// CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<!fir.array<10xi32>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[AHOST_BOX]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
// CHECK: %[[NBELEM:.*]] = arith.constant 10 : index
// CHECK: %[[WIDTH:.*]] = arith.constant 4 : index
// CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index
@@ -110,16 +115,15 @@ func.func @_QPsub5(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
}
// CHECK-LABEL: func.func @_QPsub5
+// CHECK: %[[TEMP_BOX:.*]] = fir.alloca !fir.box<!fir.array<?x?xi32>>
// CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
// CHECK: %[[SHAPE:.*]] = fir.shape %[[I1:.*]], %[[I2:.*]] : (index, index) -> !fir.shape<2>
// CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%[[SHAPE]]) {uniq_name = "_QFsub5Eahost"} : (!fir.ref<!fir.array<?x?xi32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xi32>>, !fir.ref<!fir.array<?x?xi32>>)
-// CHECK: %[[NBELEM:.*]] = arith.muli %[[I1]], %[[I2]] : index
-// CHECK: %[[WIDTH:.*]] = arith.constant 4 : index
-// CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[AHOST]]#1(%[[SHAPE]]) : (!fir.ref<!fir.array<?x?xi32>>, !fir.shape<2>) -> !fir.box<!fir.array<?x?xi32>>
+// CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<!fir.array<?x?xi32>>>
// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %[[AHOST_PTR:.*]] = fir.convert %[[AHOST]]#1 : (!fir.ref<!fir.array<?x?xi32>>) -> !fir.llvm_ptr<i8>
-// CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64
-// CHECK: fir.call @_FortranACUFDataTransferDescPtr(%[[ADEV_BOX]], %[[AHOST_PTR]], %[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none
+// CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<!fir.array<?x?xi32>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[ADEV_BOX]], %[[AHOST_BOX]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
// CHECK: %[[NBELEM:.*]] = arith.muli %[[I1]], %[[I2]] : index
// CHECK: %[[WIDTH:.*]] = arith.constant 4 : index
// CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index
@@ -248,5 +252,35 @@ func.func @_QQdesc_global() attributes {fir.bindc_name = "host_sub"} {
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[GLOBAL_DECL:.*]]#0 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFDataTransferGlobalDescDesc(%[[BOX_NONE]],{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
+fir.global @_QMmod2Eadev {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xi32>>> {
+ %c0 = arith.constant 0 : index
+ %0 = fir.zero_bits !fir.heap<!fir.array<?xi32>>
+ %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+ %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
+ fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xi32>>>
+}
+func.func @_QPdesc_global_ptr() {
+ %c10 = arith.constant 10 : index
+ %0 = fir.address_of(@_QMmod2Eadev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod2Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %2 = fir.alloca !fir.array<10xi32> {bindc_name = "ahost", uniq_name = "_QFdesc_global_ptrEahost"}
+ %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %4 = fir.declare %2(%3) {uniq_name = "_QFdesc_global_ptrEahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+ cuf.data_transfer %4 to %1 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ return
+}
+
+// CHECK-LABEL: func.func @_QPdesc_global_ptr()
+// CHECK: %[[TEMP_BOX:.*]] = fir.alloca !fir.box<!fir.array<10xi32>>
+// CHECK: %[[ADDR_ADEV:.*]] = fir.address_of(@_QMmod2Eadev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// CHECK: %[[DECL_ADEV:.*]] = fir.declare %[[ADDR_ADEV]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod2Eadev"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+// CHECK: %[[AHOST:.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "ahost", uniq_name = "_QFdesc_global_ptrEahost"}
+// CHECK: %[[SHAPE:.*]] = fir.shape %c10 : (index) -> !fir.shape<1>
+// CHECK: %[[DECL_AHOST:.*]] = fir.declare %[[AHOST]](%[[SHAPE]]) {uniq_name = "_QFdesc_global_ptrEahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+// CHECK: %[[EMBOX:.*]] = fir.embox %[[DECL_AHOST]](%[[SHAPE]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
+// CHECK: fir.store %[[EMBOX]] to %[[TEMP_BOX]] : !fir.ref<!fir.box<!fir.array<10xi32>>>
+// CHECK: %[[ADEV_BOXNONE:.*]] = fir.convert %[[DECL_ADEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[AHOST_BOXNONE:.*]] = fir.convert %[[TEMP_BOX]] : (!fir.ref<!fir.box<!fir.array<10xi32>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFDataTransferGlobalDescDesc(%[[ADEV_BOXNONE]], %[[AHOST_BOXNONE]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> none
} // end of module
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index 1685265..184abe2 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -781,11 +781,11 @@ func.func @_QPsimple_reduction(%arg0: !fir.ref<!fir.array<100x!fir.logical<4>>>
// -----
// CHECK: llvm.func @_QPs
-// CHECK: omp.atomic.read %{{.*}} = %{{.*}} : !llvm.ptr, !llvm.struct<(f32, f32)>
+// CHECK: omp.atomic.read %{{.*}} = %{{.*}} : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)>
func.func @_QPs(%arg0: !fir.ref<complex<f32>> {fir.bindc_name = "x"}) {
%0 = fir.alloca complex<f32> {bindc_name = "v", uniq_name = "_QFsEv"}
- omp.atomic.read %0 = %arg0 : !fir.ref<complex<f32>>, complex<f32>
+ omp.atomic.read %0 = %arg0 : !fir.ref<complex<f32>>, !fir.ref<complex<f32>>, complex<f32>
return
}
diff --git a/flang/test/Integration/debug-complex-2.f90 b/flang/test/Integration/debug-complex-2.f90
new file mode 100644
index 0000000..c5f46d7
--- /dev/null
+++ b/flang/test/Integration/debug-complex-2.f90
@@ -0,0 +1,12 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+! Test that the complex return type is correctly represented in the debug info.
+complex function fn(a)
+ complex, intent(in) :: a
+ fn = a
+end function
+
+! CHECK-DAG: ![[CMPLX:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float)
+! CHECK-DAG: ![[SR_TY:.*]] = !DISubroutineType(cc: DW_CC_normal, types: ![[TYPES:.*]])
+! CHECK-DAG: ![[TYPES]] = !{![[CMPLX]], ![[CMPLX]]}
+! CHECK-DAG: !DISubprogram(name: "fn"{{.*}}type: ![[SR_TY]]{{.*}})
diff --git a/flang/test/Integration/debug-external-linkage-name.f90 b/flang/test/Integration/debug-external-linkage-name.f90
new file mode 100644
index 0000000..c7fdf84
--- /dev/null
+++ b/flang/test/Integration/debug-external-linkage-name.f90
@@ -0,0 +1,10 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+! Test that the correct linkage name is generated in the debug info.
+subroutine sub(a)
+ integer :: a
+ return a+1
+end
+
+!CHECK: !DISubprogram(name: "sub", linkageName: "sub_"{{.*}})
+
diff --git a/flang/test/Lower/OpenACC/acc-atomic-capture.f90 b/flang/test/Lower/OpenACC/acc-atomic-capture.f90
index 3736833..797d322 100644
--- a/flang/test/Lower/OpenACC/acc-atomic-capture.f90
+++ b/flang/test/Lower/OpenACC/acc-atomic-capture.f90
@@ -11,7 +11,7 @@ program acc_atomic_capture_test
!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %2 {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: %[[temp:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
!CHECK: acc.atomic.capture {
-!CHECK: acc.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 : !fir.ref<i32>
+!CHECK: acc.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>, i32
!CHECK: acc.atomic.update %[[Y_DECL]]#1 : !fir.ref<i32> {
!CHECK: ^bb0(%[[ARG:.*]]: i32):
!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[ARG]] : i32
@@ -32,7 +32,7 @@ program acc_atomic_capture_test
!CHECK: %[[result:.*]] = arith.muli %[[temp]], %[[ARG]] : i32
!CHECK: acc.yield %[[result]] : i32
!CHECK: }
-!CHECK: acc.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 : !fir.ref<i32>
+!CHECK: acc.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>, i32
!CHECK: }
!$acc atomic capture
@@ -47,7 +47,7 @@ program acc_atomic_capture_test
!CHECK: %[[result_noreassoc:.*]] = hlfir.no_reassoc %[[result]] : i32
!CHECK: %[[result:.*]] = arith.addi %[[constant_20]], %[[result_noreassoc]] : i32
!CHECK: acc.atomic.capture {
-!CHECK: acc.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 : !fir.ref<i32>
+!CHECK: acc.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>, i32
!CHECK: acc.atomic.write %[[Y_DECL]]#1 = %[[result]] : !fir.ref<i32>, i32
!CHECK: }
@@ -82,7 +82,7 @@ subroutine pointers_in_atomic_capture()
!CHECK: %[[result:.*]] = arith.addi %[[ARG]], %[[loaded_value]] : i32
!CHECK: acc.yield %[[result]] : i32
!CHECK: }
-!CHECK: acc.atomic.read %[[loaded_B_addr]] = %[[loaded_A_addr]] : !fir.ptr<i32>, i32
+!CHECK: acc.atomic.read %[[loaded_B_addr]] = %[[loaded_A_addr]] : !fir.ptr<i32>, !fir.ptr<i32>, i32
!CHECK: }
integer, pointer :: a, b
integer, target :: c, d
@@ -118,10 +118,99 @@ end subroutine
! CHECK: %[[MUL:.*]] = arith.mulf %{{.*}}, %[[CST]] fastmath<contract> : f32
! CHECK: %[[CONV:.*]] = fir.convert %[[MUL]] : (f32) -> i32
! CHECK: acc.atomic.capture {
-! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[K_DECL]]#1 : !fir.ref<i32>, i32
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[K_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>, i32
! CHECK: acc.atomic.write %[[K_DECL]]#1 = %[[CONV]] : !fir.ref<i32>, i32
! CHECK: }
+subroutine capture_with_convert_i32_to_f64()
+ real(8) :: x
+ integer :: v
+ x = 1.0
+ v = 0
+ !$acc atomic capture
+ v = x
+ x = v
+ !$acc end atomic
+end subroutine capture_with_convert_i32_to_f64
+
+! CHECK-LABEL: func.func @_QPcapture_with_convert_i32_to_f64()
+! CHECK: %[[V:.*]] = fir.alloca i32 {bindc_name = "v", uniq_name = "_QFcapture_with_convert_i32_to_f64Ev"}
+! CHECK: %[[V_DECL:.*]]:2 = hlfir.declare %[[V]] {uniq_name = "_QFcapture_with_convert_i32_to_f64Ev"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[X:.*]] = fir.alloca f64 {bindc_name = "x", uniq_name = "_QFcapture_with_convert_i32_to_f64Ex"}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFcapture_with_convert_i32_to_f64Ex"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK: %[[CST:.*]] = arith.constant 1.000000e+00 : f64
+! CHECK: hlfir.assign %[[CST]] to %[[X_DECL]]#0 : f64, !fir.ref<f64>
+! CHECK: %c0_i32 = arith.constant 0 : i32
+! CHECK: hlfir.assign %c0_i32 to %[[V_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD:.*]] = fir.load %[[V_DECL]]#0 : !fir.ref<i32>
+! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (i32) -> f64
+! CHECK: acc.atomic.capture {
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_DECL]]#1 : !fir.ref<i32>, !fir.ref<f64>, f64
+! CHECK: acc.atomic.write %[[X_DECL]]#1 = %[[CONV]] : !fir.ref<f64>, f64
+! CHECK: }
+
+subroutine capture_with_convert_f64_to_i32()
+ integer :: x
+ real(8) :: v
+ x = 1
+ v = 0
+ !$acc atomic capture
+ x = v * v
+ v = x
+ !$acc end atomic
+end subroutine capture_with_convert_f64_to_i32
+
+! CHECK-LABEL: func.func @_QPcapture_with_convert_f64_to_i32()
+! CHECK: %[[V:.*]] = fir.alloca f64 {bindc_name = "v", uniq_name = "_QFcapture_with_convert_f64_to_i32Ev"}
+! CHECK: %[[V_DECL:.*]]:2 = hlfir.declare %[[V]] {uniq_name = "_QFcapture_with_convert_f64_to_i32Ev"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFcapture_with_convert_f64_to_i32Ex"}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFcapture_with_convert_f64_to_i32Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %c1_i32 = arith.constant 1 : i32
+! CHECK: hlfir.assign %c1_i32 to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK: hlfir.assign %[[CST]] to %[[V_DECL]]#0 : f64, !fir.ref<f64>
+! CHECK: %[[LOAD:.*]] = fir.load %[[V_DECL]]#0 : !fir.ref<f64>
+! CHECK: acc.atomic.capture {
+! CHECK: acc.atomic.update %[[X_DECL]]#1 : !fir.ref<i32> {
+! CHECK: ^bb0(%arg0: i32):
+! CHECK: %[[MUL:.*]] = arith.mulf %[[LOAD]], %[[LOAD]] fastmath<contract> : f64
+! CHECK: %[[CONV:.*]] = fir.convert %[[MUL]] : (f64) -> i32
+! CHECK: acc.yield %[[CONV]] : i32
+! CHECK: }
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_DECL]]#1 : !fir.ref<f64>, !fir.ref<i32>, i32
+! CHECK: }
+
+subroutine capture_with_convert_i32_to_f32()
+ real(4) :: x
+ integer :: v
+ x = 1.0
+ v = 0
+ !$acc atomic capture
+ v = x
+ x = x + v
+ !$acc end atomic
+end subroutine capture_with_convert_i32_to_f32
+
+! CHECK-LABEL: func.func @_QPcapture_with_convert_i32_to_f32()
+! CHECK: %[[V:.*]] = fir.alloca i32 {bindc_name = "v", uniq_name = "_QFcapture_with_convert_i32_to_f32Ev"}
+! CHECK: %[[V_DECL:.*]]:2 = hlfir.declare %[[V]] {uniq_name = "_QFcapture_with_convert_i32_to_f32Ev"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFcapture_with_convert_i32_to_f32Ex"}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFcapture_with_convert_i32_to_f32Ex"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[CST:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK: hlfir.assign %[[CST]] to %[[X_DECL]]#0 : f32, !fir.ref<f32>
+! CHECK: %c0_i32 = arith.constant 0 : i32
+! CHECK: hlfir.assign %c0_i32 to %[[V_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD:.*]] = fir.load %[[V_DECL]]#0 : !fir.ref<i32>
+! CHECK: acc.atomic.capture {
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_DECL]]#1 : !fir.ref<i32>, !fir.ref<f32>, f32
+! CHECK: acc.atomic.update %[[X_DECL]]#1 : !fir.ref<f32> {
+! CHECK: ^bb0(%arg0: f32):
+! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (i32) -> f32
+! CHECK: %[[ADD:.*]] = arith.addf %arg0, %[[CONV]] fastmath<contract> : f32
+! CHECK: acc.yield %[[ADD]] : f32
+! CHECK: }
+! CHECK: }
+
subroutine array_ref_in_atomic_capture1
integer :: x(10), v
!$acc atomic capture
@@ -136,7 +225,7 @@ end subroutine array_ref_in_atomic_capture1
! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]](%{{.*}}) {uniq_name = "_QFarray_ref_in_atomic_capture1Ex"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
! CHECK: %[[X_REF:.*]] = hlfir.designate %[[X_DECL]]#0 (%{{.*}}) : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
! CHECK: acc.atomic.capture {
-! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_REF]] : !fir.ref<i32>, i32
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_REF]] : !fir.ref<i32>, !fir.ref<i32>, i32
! CHECK: acc.atomic.update %[[X_REF]] : !fir.ref<i32> {
! CHECK: ^bb0(%[[VAL_7:.*]]: i32):
! CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %{{.*}} : i32
@@ -163,7 +252,7 @@ end subroutine array_ref_in_atomic_capture2
! CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %{{.*}} : i32
! CHECK: acc.yield %[[VAL_8]] : i32
! CHECK: }
-! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_REF]] : !fir.ref<i32>, i32
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[X_REF]] : !fir.ref<i32>, !fir.ref<i32>, i32
! CHECK: }
subroutine comp_ref_in_atomic_capture1
@@ -184,7 +273,7 @@ end subroutine comp_ref_in_atomic_capture1
! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFcomp_ref_in_atomic_capture1Ex"} : (!fir.ref<!fir.type<_QFcomp_ref_in_atomic_capture1Tt1{c:i32}>>) -> (!fir.ref<!fir.type<_QFcomp_ref_in_atomic_capture1Tt1{c:i32}>>, !fir.ref<!fir.type<_QFcomp_ref_in_atomic_capture1Tt1{c:i32}>>)
! CHECK: %[[C:.*]] = hlfir.designate %[[X_DECL]]#0{"c"} : (!fir.ref<!fir.type<_QFcomp_ref_in_atomic_capture1Tt1{c:i32}>>) -> !fir.ref<i32>
! CHECK: acc.atomic.capture {
-! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[C]] : !fir.ref<i32>, i32
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[C]] : !fir.ref<i32>, !fir.ref<i32>, i32
! CHECK: acc.atomic.update %[[C]] : !fir.ref<i32> {
! CHECK: ^bb0(%[[VAL_5:.*]]: i32):
! CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %{{.*}} : i32
@@ -215,5 +304,5 @@ end subroutine comp_ref_in_atomic_capture2
! CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %{{.*}} : i32
! CHECK: acc.yield %[[VAL_6]] : i32
! CHECK: }
-! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[C]] : !fir.ref<i32>, i32
+! CHECK: acc.atomic.read %[[V_DECL]]#1 = %[[C]] : !fir.ref<i32>, !fir.ref<i32>, i32
! CHECK: }
diff --git a/flang/test/Lower/OpenACC/acc-atomic-read.f90 b/flang/test/Lower/OpenACC/acc-atomic-read.f90
index c1a97a9..f2cbe6e4 100644
--- a/flang/test/Lower/OpenACC/acc-atomic-read.f90
+++ b/flang/test/Lower/OpenACC/acc-atomic-read.f90
@@ -13,7 +13,7 @@ end program acc_atomic_test
! CHECK: %[[G_DECL:.*]]:2 = hlfir.declare %[[VAR_G]] {uniq_name = "_QFEg"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[VAR_H:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"}
! CHECK: %[[H_DECL:.*]]:2 = hlfir.declare %[[VAR_H]] {uniq_name = "_QFEh"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: acc.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1 : !fir.ref<f32>, f32
+! CHECK: acc.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1 : !fir.ref<f32>, !fir.ref<f32>, f32
! CHECK: return
! CHECK: }
@@ -39,10 +39,10 @@ end
! CHECK: %[[BOX_ADDR_X:.*]] = fir.box_addr %[[LOAD_X]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
! CHECK: %[[LOAD_Y:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
! CHECK: %[[BOX_ADDR_Y:.*]] = fir.box_addr %[[LOAD_Y]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-! CHECK: acc.atomic.read %[[BOX_ADDR_Y]] = %[[BOX_ADDR_X]] : !fir.ptr<i32>, i32
+! CHECK: acc.atomic.read %[[BOX_ADDR_Y]] = %[[BOX_ADDR_X]] : !fir.ptr<i32>, !fir.ptr<i32>, i32
! CHECK: }
-subroutine atomic_read_with_convert()
+subroutine atomic_read_with_cast()
integer(4) :: x
integer(8) :: y
@@ -50,10 +50,9 @@ subroutine atomic_read_with_convert()
y = x
end
-! CHECK-LABEL: func.func @_QPatomic_read_with_convert() {
-! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFatomic_read_with_convertEx"}
-! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFatomic_read_with_convertEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[Y:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFatomic_read_with_convertEy"}
-! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFatomic_read_with_convertEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-! CHECK: %[[CONV:.*]] = fir.convert %[[X_DECL]]#1 : (!fir.ref<i32>) -> !fir.ref<i64>
-! CHECK: acc.atomic.read %[[Y_DECL]]#1 = %[[CONV]] : !fir.ref<i64>, i32
+! CHECK-LABEL: func.func @_QPatomic_read_with_cast() {
+! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFatomic_read_with_castEx"}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFatomic_read_with_castEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[Y:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFatomic_read_with_castEy"}
+! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFatomic_read_with_castEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: acc.atomic.read %[[Y_DECL]]#1 = %[[X_DECL]]#1 : !fir.ref<i64>, !fir.ref<i32>, i32
diff --git a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90
index eeb7ea2..f89a9ab 100644
--- a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90
+++ b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90
@@ -45,7 +45,7 @@ end subroutine
! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Ex"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
! CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFatomic_read_array1Er"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
! CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
-! CHECK: acc.atomic.read %[[DECL_X]]#1 = %[[DES]] : !fir.ref<f32>, f32
+! CHECK: acc.atomic.read %[[DECL_X]]#1 = %[[DES]] : !fir.ref<f32>, !fir.ref<f32>, f32
subroutine atomic_write_array1(r, n, x)
implicit none
@@ -88,5 +88,5 @@ end subroutine
! CHECK: %[[ADD:.*]] = arith.addf %[[ARG]], %[[LOAD]] fastmath<contract> : f32
! CHECK: acc.yield %[[ADD]] : f32
! CHECK: }
-! CHECK: acc.atomic.read %[[DECL_Y]]#1 = %[[R_I]] : !fir.ref<f32>, f32
+! CHECK: acc.atomic.read %[[DECL_Y]]#1 = %[[R_I]] : !fir.ref<f32>, !fir.ref<f32>, f32
! CHECK: }
diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90
new file mode 100644
index 0000000..017df00
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90
@@ -0,0 +1,11 @@
+!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
+!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
+
+!CHECK: not yet implemented: INOUTSET and MUTEXINOUTSET are not supported yet
+subroutine f00(x)
+ integer :: x
+ !$omp task depend(inoutset: x)
+ x = x + 1
+ !$omp end task
+end
+
diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90
new file mode 100644
index 0000000..2f6ff77
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90
@@ -0,0 +1,11 @@
+!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
+!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
+
+!CHECK: not yet implemented: INOUTSET and MUTEXINOUTSET are not supported yet
+subroutine f00(x)
+ integer :: x
+ !$omp task depend(mutexinoutset: x)
+ x = x + 1
+ !$omp end task
+end
+
diff --git a/flang/test/Lower/OpenMP/Todo/task_detach.f90 b/flang/test/Lower/OpenMP/Todo/task_detach.f90
index 8d0f1c6..6bc55e9 100644
--- a/flang/test/Lower/OpenMP/Todo/task_detach.f90
+++ b/flang/test/Lower/OpenMP/Todo/task_detach.f90
@@ -1,6 +1,6 @@
! REQUIRES: openmp_runtime
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd bbc -emit-fir %openmp_flags -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir %openmp_flags -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
!===============================================================================
! `detach` clause
diff --git a/flang/test/Lower/OpenMP/atomic-capture.f90 b/flang/test/Lower/OpenMP/atomic-capture.f90
index af82e4b..679d22d 100644
--- a/flang/test/Lower/OpenMP/atomic-capture.f90
+++ b/flang/test/Lower/OpenMP/atomic-capture.f90
@@ -22,7 +22,7 @@ program OmpAtomicCapture
!CHECK: %[[TEMP:.*]] = arith.muli %[[VAL_Y_LOADED]], %[[ARG]] : i32
!CHECK: omp.yield(%[[TEMP]] : i32)
!CHECK: }
-!CHECK: omp.atomic.read %[[VAL_X_DECLARE]]#1 = %[[VAL_Y_DECLARE]]#1 : !fir.ref<i32>, i32
+!CHECK: omp.atomic.read %[[VAL_X_DECLARE]]#1 = %[[VAL_Y_DECLARE]]#1 : !fir.ref<i32>, !fir.ref<i32>, i32
!CHECK: }
!$omp atomic hint(omp_sync_hint_uncontended) capture
y = x * y
@@ -36,7 +36,7 @@ program OmpAtomicCapture
!CHECK: %[[NO_REASSOC:.*]] = hlfir.no_reassoc %[[SUB]] : i32
!CHECK: %[[ADD:.*]] = arith.addi %[[VAL_20]], %[[NO_REASSOC]] : i32
!CHECK: omp.atomic.capture hint(nonspeculative) memory_order(acquire) {
-!CHECK: omp.atomic.read %[[VAL_X_DECLARE]]#1 = %[[VAL_Y_DECLARE]]#1 : !fir.ref<i32>, i32
+!CHECK: omp.atomic.read %[[VAL_X_DECLARE]]#1 = %[[VAL_Y_DECLARE]]#1 : !fir.ref<i32>, !fir.ref<i32>, i32
!CHECK: omp.atomic.write %[[VAL_Y_DECLARE]]#1 = %[[ADD]] : !fir.ref<i32>, i32
!CHECK: }
!CHECK: return
@@ -88,7 +88,7 @@ subroutine pointers_in_atomic_capture()
!CHECK: %[[TEMP:.*]] = arith.addi %[[ARG]], %[[VAL_B]] : i32
!CHECK: omp.yield(%[[TEMP]] : i32)
!CHECK: }
-!CHECK: omp.atomic.read %[[VAL_B_BOX_ADDR]] = %[[VAL_A_BOX_ADDR]] : !fir.ptr<i32>, i32
+!CHECK: omp.atomic.read %[[VAL_B_BOX_ADDR]] = %[[VAL_A_BOX_ADDR]] : !fir.ptr<i32>, !fir.ptr<i32>, i32
!CHECK: }
!CHECK: return
!CHECK: }
diff --git a/flang/test/Lower/OpenMP/atomic-read.f90 b/flang/test/Lower/OpenMP/atomic-read.f90
index c3270dd..e9bea422 100644
--- a/flang/test/Lower/OpenMP/atomic-read.f90
+++ b/flang/test/Lower/OpenMP/atomic-read.f90
@@ -25,12 +25,12 @@
!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: %[[Y_REF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_REF]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: omp.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 hint(uncontended) memory_order(acquire) : !fir.ref<i32>, i32
-!CHECK: omp.atomic.read %[[A_DECL]]#1 = %[[B_DECL]]#1 memory_order(relaxed) : !fir.ref<i32>, i32
-!CHECK: omp.atomic.read %[[C_DECL]]#1 = %[[D_DECL]]#1 hint(contended) memory_order(seq_cst) : !fir.ref<!fir.logical<4>>, !fir.logical<4>
-!CHECK: omp.atomic.read %[[E_DECL]]#1 = %[[F_DECL]]#1 hint(speculative) : !fir.ref<i32>, i32
-!CHECK: omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1 hint(nonspeculative) : !fir.ref<f32>, f32
-!CHECK: omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1 : !fir.ref<f32>, f32
+!CHECK: omp.atomic.read %[[X_DECL]]#1 = %[[Y_DECL]]#1 hint(uncontended) memory_order(acquire) : !fir.ref<i32>, !fir.ref<i32>, i32
+!CHECK: omp.atomic.read %[[A_DECL]]#1 = %[[B_DECL]]#1 memory_order(relaxed) : !fir.ref<i32>, !fir.ref<i32>, i32
+!CHECK: omp.atomic.read %[[C_DECL]]#1 = %[[D_DECL]]#1 hint(contended) memory_order(seq_cst) : !fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, !fir.logical<4>
+!CHECK: omp.atomic.read %[[E_DECL]]#1 = %[[F_DECL]]#1 hint(speculative) : !fir.ref<i32>, !fir.ref<i32>, i32
+!CHECK: omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1 hint(nonspeculative) : !fir.ref<f32>, !fir.ref<f32>, f32
+!CHECK: omp.atomic.read %[[G_DECL]]#1 = %[[H_DECL]]#1 : !fir.ref<f32>, !fir.ref<f32>, f32
program OmpAtomic
@@ -68,7 +68,7 @@ end program OmpAtomic
!CHECK: %[[X_POINTEE_ADDR:.*]] = fir.box_addr %[[X_ADDR]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
!CHECK: %[[Y_ADDR:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
!CHECK: %[[Y_POINTEE_ADDR:.*]] = fir.box_addr %[[Y_ADDR]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: omp.atomic.read %[[Y_POINTEE_ADDR]] = %[[X_POINTEE_ADDR]] : !fir.ptr<i32>, i32
+!CHECK: omp.atomic.read %[[Y_POINTEE_ADDR]] = %[[X_POINTEE_ADDR]] : !fir.ptr<i32>, !fir.ptr<i32>, i32
!CHECK: %[[Y_ADDR:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
!CHECK: %[[Y_POINTEE_ADDR:.*]] = fir.box_addr %[[Y_ADDR]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
!CHECK: %[[Y_POINTEE_VAL:.*]] = fir.load %[[Y_POINTEE_ADDR]] : !fir.ptr<i32>
diff --git a/flang/test/Parser/OpenMP/task.f90 b/flang/test/Parser/OpenMP/task.f90
index c89f9aa..706deb3 100644
--- a/flang/test/Parser/OpenMP/task.f90
+++ b/flang/test/Parser/OpenMP/task.f90
@@ -1,6 +1,6 @@
! REQUIRES: openmp_runtime
-! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=50 %s | FileCheck --ignore-case %s
-! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=50 %s | FileCheck --ignore-case --check-prefix="CHECK-UNPARSE" %s
+! RUN: %flang_fc1 %openmp_flags -fdebug-dump-parse-tree -fopenmp -fopenmp-version=50 %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 %openmp_flags -fdebug-unparse -fopenmp -fopenmp-version=50 %s | FileCheck --ignore-case --check-prefix="CHECK-UNPARSE" %s
!CHECK: OmpBlockDirective -> llvm::omp::Directive = task
!CHECK: OmpClauseList -> OmpClause -> Detach -> OmpDetachClause -> OmpObject -> Designator -> DataRef -> Name = 'event'
diff --git a/flang/test/Semantics/OpenMP/depend06.f90 b/flang/test/Semantics/OpenMP/depend06.f90
new file mode 100644
index 0000000..a9668c5
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/depend06.f90
@@ -0,0 +1,17 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=45 -Werror
+
+subroutine f00(x)
+ integer :: x
+!WARNING: INOUTSET task-dependence-type is not supported in OpenMP v4.5, try -fopenmp-version=52
+ !$omp task depend(inoutset: x)
+ x = x + 1
+ !$omp end task
+end
+
+subroutine f01(x)
+ integer :: x
+!WARNING: MUTEXINOUTSET task-dependence-type is not supported in OpenMP v4.5, try -fopenmp-version=50
+ !$omp task depend(mutexinoutset: x)
+ x = x + 1
+ !$omp end task
+end
diff --git a/flang/test/Semantics/OpenMP/depobj-construct-v52.f90 b/flang/test/Semantics/OpenMP/depobj-construct-v52.f90
index f2e6648..3e2345e 100644
--- a/flang/test/Semantics/OpenMP/depobj-construct-v52.f90
+++ b/flang/test/Semantics/OpenMP/depobj-construct-v52.f90
@@ -2,7 +2,7 @@
subroutine f00
integer :: obj
-!WARNING: The SOURCE task-dependence-type is deprecated in OpenMP v5.2
+!WARNING: SOURCE task-dependence-type is deprecated in OpenMP v5.2
!ERROR: A DEPEND clause on a DEPOBJ construct must not have SOURCE or SINK as dependence-type
!$omp depobj(obj) depend(source)
end
diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py
index a295058..0e8ca2d 100644
--- a/libc/newhdrgen/yaml_to_classes.py
+++ b/libc/newhdrgen/yaml_to_classes.py
@@ -253,7 +253,7 @@ def main():
args = parser.parse_args()
if args.add_function:
- add_function_to_yaml(yaml_file, args.add_function)
+ add_function_to_yaml(args.yaml_file, args.add_function)
header_class = GpuHeader if args.export_decls else HeaderFile
header = load_yaml_file(args.yaml_file, header_class, args.entry_points)
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index db24b65..3c4a133 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -278,7 +278,7 @@ Status
---------------------------------------------------------- -----------------
``__cpp_lib_shift`` ``201806L``
---------------------------------------------------------- -----------------
- ``__cpp_lib_smart_ptr_for_overwrite`` *unimplemented*
+ ``__cpp_lib_smart_ptr_for_overwrite`` ``202002L``
---------------------------------------------------------- -----------------
``__cpp_lib_source_location`` ``201907L``
---------------------------------------------------------- -----------------
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index c8c30a3..66cb101 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -68,7 +68,7 @@
"`P0972R0 <https://wg21.link/P0972R0>`__","<chrono> ``zero()``\ , ``min()``\ , and ``max()``\ should be noexcept","2018-11 (San Diego)","|Complete|","8",""
"`P1006R1 <https://wg21.link/P1006R1>`__","Constexpr in std::pointer_traits","2018-11 (San Diego)","|Complete|","8",""
"`P1007R3 <https://wg21.link/P1007R3>`__","``std::assume_aligned``\ ","2018-11 (San Diego)","|Complete|","15",""
-"`P1020R1 <https://wg21.link/P1020R1>`__","Smart pointer creation with default initialization","2018-11 (San Diego)","|Complete|","16",""
+"`P1020R1 <https://wg21.link/P1020R1>`__","Smart pointer creation with default initialization","2018-11 (San Diego)","|Complete|","16","The feature-test macro was not set until LLVM 20."
"`P1032R1 <https://wg21.link/P1032R1>`__","Misc constexpr bits","2018-11 (San Diego)","|Complete|","13",""
"`P1085R2 <https://wg21.link/P1085R2>`__","Should Span be Regular?","2018-11 (San Diego)","|Complete|","8",""
"`P1123R0 <https://wg21.link/P1123R0>`__","Editorial Guidance for merging P0019r8 and P0528r3","2018-11 (San Diego)","","",""
@@ -177,7 +177,7 @@
"`P1963R0 <https://wg21.link/P1963R0>`__","Fixing US 313","2020-02 (Prague)","","",""
"`P1964R2 <https://wg21.link/P1964R2>`__","Wording for boolean-testable","2020-02 (Prague)","|Complete|","13",""
"`P1970R2 <https://wg21.link/P1970R2>`__","Consistency for size() functions: Add ranges::ssize","2020-02 (Prague)","|Complete|","15",""
-"`P1973R1 <https://wg21.link/P1973R1>`__","Rename ""_default_init"" Functions, Rev1","2020-02 (Prague)","|Complete|","16",""
+"`P1973R1 <https://wg21.link/P1973R1>`__","Rename ""_default_init"" Functions, Rev1","2020-02 (Prague)","|Complete|","16","The feature-test macro was not set until LLVM 20."
"`P1976R2 <https://wg21.link/P1976R2>`__","Fixed-size span construction from dynamic range","2020-02 (Prague)","|Complete|","11",""
"`P1981R0 <https://wg21.link/P1981R0>`__","Rename leap to leap_second","2020-02 (Prague)","|Complete|","19",""
"`P1982R0 <https://wg21.link/P1982R0>`__","Rename link to time_zone_link","2020-02 (Prague)","|Complete|","19",""
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index 102df63..2a2f262 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -108,12 +108,6 @@ public:
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~__split_buffer();
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __alloc_rr& __alloc() _NOEXCEPT { return __alloc_; }
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const __alloc_rr& __alloc() const _NOEXCEPT { return __alloc_; }
-
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer& __end_cap() _NOEXCEPT { return __end_cap_; }
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const pointer& __end_cap() const _NOEXCEPT { return __end_cap_; }
-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __begin_; }
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __begin_; }
@@ -129,7 +123,7 @@ public:
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool empty() const { return __end_ == __begin_; }
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type capacity() const {
- return static_cast<size_type>(__end_cap() - __first_);
+ return static_cast<size_type>(__end_cap_ - __first_);
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __front_spare() const {
@@ -137,7 +131,7 @@ public:
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __back_spare() const {
- return static_cast<size_type>(__end_cap() - __end_);
+ return static_cast<size_type>(__end_cap_ - __end_);
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference front() { return *__begin_; }
@@ -196,7 +190,7 @@ public:
private:
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer& __c, true_type)
_NOEXCEPT_(is_nothrow_move_assignable<allocator_type>::value) {
- __alloc() = std::move(__c.__alloc());
+ __alloc_ = std::move(__c.__alloc_);
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer&, false_type) _NOEXCEPT {}
@@ -225,14 +219,14 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __split_buffer<_Tp, _Allocator>::__invariants
return false;
if (__end_ != nullptr)
return false;
- if (__end_cap() != nullptr)
+ if (__end_cap_ != nullptr)
return false;
} else {
if (__begin_ < __first_)
return false;
if (__end_ < __begin_)
return false;
- if (__end_cap() < __end_)
+ if (__end_cap_ < __end_)
return false;
}
return true;
@@ -247,7 +241,7 @@ template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n) {
_ConstructTransaction __tx(&this->__end_, __n);
for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) {
- __alloc_traits::construct(this->__alloc(), std::__to_address(__tx.__pos_));
+ __alloc_traits::construct(__alloc_, std::__to_address(__tx.__pos_));
}
}
@@ -262,7 +256,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void
__split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) {
_ConstructTransaction __tx(&this->__end_, __n);
for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) {
- __alloc_traits::construct(this->__alloc(), std::__to_address(__tx.__pos_), __x);
+ __alloc_traits::construct(__alloc_, std::__to_address(__tx.__pos_), __x);
}
}
@@ -277,14 +271,14 @@ template <class _Tp, class _Allocator>
template <class _Iterator, class _Sentinel>
_LIBCPP_CONSTEXPR_SINCE_CXX20 void
__split_buffer<_Tp, _Allocator>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) {
- __alloc_rr& __a = this->__alloc();
+ __alloc_rr& __a = __alloc_;
for (; __first != __last; ++__first) {
- if (__end_ == __end_cap()) {
- size_type __old_cap = __end_cap() - __first_;
+ if (__end_ == __end_cap_) {
+ size_type __old_cap = __end_cap_ - __first_;
size_type __new_cap = std::max<size_type>(2 * __old_cap, 8);
__split_buffer __buf(__new_cap, 0, __a);
for (pointer __p = __begin_; __p != __end_; ++__p, (void)++__buf.__end_)
- __alloc_traits::construct(__buf.__alloc(), std::__to_address(__buf.__end_), std::move(*__p));
+ __alloc_traits::construct(__buf.__alloc_, std::__to_address(__buf.__end_), std::move(*__p));
swap(__buf);
}
__alloc_traits::construct(__a, std::__to_address(this->__end_), *__first);
@@ -304,7 +298,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void
__split_buffer<_Tp, _Allocator>::__construct_at_end_with_size(_ForwardIterator __first, size_type __n) {
_ConstructTransaction __tx(&this->__end_, __n);
for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_, (void)++__first) {
- __alloc_traits::construct(this->__alloc(), std::__to_address(__tx.__pos_), *__first);
+ __alloc_traits::construct(__alloc_, std::__to_address(__tx.__pos_), *__first);
}
}
@@ -312,7 +306,7 @@ template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void
__split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer __new_begin, false_type) {
while (__begin_ != __new_begin)
- __alloc_traits::destroy(__alloc(), std::__to_address(__begin_++));
+ __alloc_traits::destroy(__alloc_, std::__to_address(__begin_++));
}
template <class _Tp, class _Allocator>
@@ -325,7 +319,7 @@ template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void
__split_buffer<_Tp, _Allocator>::__destruct_at_end(pointer __new_last, false_type) _NOEXCEPT {
while (__new_last != __end_)
- __alloc_traits::destroy(__alloc(), std::__to_address(--__end_));
+ __alloc_traits::destroy(__alloc_, std::__to_address(--__end_));
}
template <class _Tp, class _Allocator>
@@ -341,19 +335,19 @@ __split_buffer<_Tp, _Allocator>::__split_buffer(size_type __cap, size_type __sta
if (__cap == 0) {
__first_ = nullptr;
} else {
- auto __allocation = std::__allocate_at_least(__alloc(), __cap);
+ auto __allocation = std::__allocate_at_least(__alloc_, __cap);
__first_ = __allocation.ptr;
__cap = __allocation.count;
}
__begin_ = __end_ = __first_ + __start;
- __end_cap() = __first_ + __cap;
+ __end_cap_ = __first_ + __cap;
}
template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::~__split_buffer() {
clear();
if (__first_)
- __alloc_traits::deallocate(__alloc(), __first_, capacity());
+ __alloc_traits::deallocate(__alloc_, __first_, capacity());
}
template <class _Tp, class _Allocator>
@@ -364,30 +358,30 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::__split_buffer(__
__end_(std::move(__c.__end_)),
__end_cap_(std::move(__c.__end_cap_)),
__alloc_(std::move(__c.__alloc_)) {
- __c.__first_ = nullptr;
- __c.__begin_ = nullptr;
- __c.__end_ = nullptr;
- __c.__end_cap() = nullptr;
+ __c.__first_ = nullptr;
+ __c.__begin_ = nullptr;
+ __c.__end_ = nullptr;
+ __c.__end_cap_ = nullptr;
}
template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20
__split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c, const __alloc_rr& __a)
: __end_cap_(nullptr), __alloc_(__a) {
- if (__a == __c.__alloc()) {
- __first_ = __c.__first_;
- __begin_ = __c.__begin_;
- __end_ = __c.__end_;
- __end_cap() = __c.__end_cap();
- __c.__first_ = nullptr;
- __c.__begin_ = nullptr;
- __c.__end_ = nullptr;
- __c.__end_cap() = nullptr;
+ if (__a == __c.__alloc_) {
+ __first_ = __c.__first_;
+ __begin_ = __c.__begin_;
+ __end_ = __c.__end_;
+ __end_cap_ = __c.__end_cap_;
+ __c.__first_ = nullptr;
+ __c.__begin_ = nullptr;
+ __c.__end_ = nullptr;
+ __c.__end_cap_ = nullptr;
} else {
- auto __allocation = std::__allocate_at_least(__alloc(), __c.size());
+ auto __allocation = std::__allocate_at_least(__alloc_, __c.size());
__first_ = __allocation.ptr;
__begin_ = __end_ = __first_;
- __end_cap() = __first_ + __allocation.count;
+ __end_cap_ = __first_ + __allocation.count;
typedef move_iterator<iterator> _Ip;
__construct_at_end(_Ip(__c.begin()), _Ip(__c.end()));
}
@@ -401,12 +395,12 @@ __split_buffer<_Tp, _Allocator>::operator=(__split_buffer&& __c)
!__alloc_traits::propagate_on_container_move_assignment::value) {
clear();
shrink_to_fit();
- __first_ = __c.__first_;
- __begin_ = __c.__begin_;
- __end_ = __c.__end_;
- __end_cap() = __c.__end_cap();
+ __first_ = __c.__first_;
+ __begin_ = __c.__begin_;
+ __end_ = __c.__end_;
+ __end_cap_ = __c.__end_cap_;
__move_assign_alloc(__c, integral_constant<bool, __alloc_traits::propagate_on_container_move_assignment::value>());
- __c.__first_ = __c.__begin_ = __c.__end_ = __c.__end_cap() = nullptr;
+ __c.__first_ = __c.__begin_ = __c.__end_ = __c.__end_cap_ = nullptr;
return *this;
}
@@ -416,19 +410,19 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::swap(__split
std::swap(__first_, __x.__first_);
std::swap(__begin_, __x.__begin_);
std::swap(__end_, __x.__end_);
- std::swap(__end_cap(), __x.__end_cap());
- std::__swap_allocator(__alloc(), __x.__alloc());
+ std::swap(__end_cap_, __x.__end_cap_);
+ std::__swap_allocator(__alloc_, __x.__alloc_);
}
template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::reserve(size_type __n) {
if (__n < capacity()) {
- __split_buffer<value_type, __alloc_rr&> __t(__n, 0, __alloc());
+ __split_buffer<value_type, __alloc_rr&> __t(__n, 0, __alloc_);
__t.__construct_at_end(move_iterator<pointer>(__begin_), move_iterator<pointer>(__end_));
std::swap(__first_, __t.__first_);
std::swap(__begin_, __t.__begin_);
std::swap(__end_, __t.__end_);
- std::swap(__end_cap(), __t.__end_cap());
+ std::swap(__end_cap_, __t.__end_cap_);
}
}
@@ -438,13 +432,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::shrink_to_fi
#if _LIBCPP_HAS_EXCEPTIONS
try {
#endif // _LIBCPP_HAS_EXCEPTIONS
- __split_buffer<value_type, __alloc_rr&> __t(size(), 0, __alloc());
+ __split_buffer<value_type, __alloc_rr&> __t(size(), 0, __alloc_);
__t.__construct_at_end(move_iterator<pointer>(__begin_), move_iterator<pointer>(__end_));
__t.__end_ = __t.__begin_ + (__end_ - __begin_);
std::swap(__first_, __t.__first_);
std::swap(__begin_, __t.__begin_);
std::swap(__end_, __t.__end_);
- std::swap(__end_cap(), __t.__end_cap());
+ std::swap(__end_cap_, __t.__end_cap_);
#if _LIBCPP_HAS_EXCEPTIONS
} catch (...) {
}
@@ -456,45 +450,45 @@ template <class _Tp, class _Allocator>
template <class... _Args>
_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::emplace_front(_Args&&... __args) {
if (__begin_ == __first_) {
- if (__end_ < __end_cap()) {
- difference_type __d = __end_cap() - __end_;
+ if (__end_ < __end_cap_) {
+ difference_type __d = __end_cap_ - __end_;
__d = (__d + 1) / 2;
__begin_ = std::move_backward(__begin_, __end_, __end_ + __d);
__end_ += __d;
} else {
- size_type __c = std::max<size_type>(2 * static_cast<size_t>(__end_cap() - __first_), 1);
- __split_buffer<value_type, __alloc_rr&> __t(__c, (__c + 3) / 4, __alloc());
+ size_type __c = std::max<size_type>(2 * static_cast<size_t>(__end_cap_ - __first_), 1);
+ __split_buffer<value_type, __alloc_rr&> __t(__c, (__c + 3) / 4, __alloc_);
__t.__construct_at_end(move_iterator<pointer>(__begin_), move_iterator<pointer>(__end_));
std::swap(__first_, __t.__first_);
std::swap(__begin_, __t.__begin_);
std::swap(__end_, __t.__end_);
- std::swap(__end_cap(), __t.__end_cap());
+ std::swap(__end_cap_, __t.__end_cap_);
}
}
- __alloc_traits::construct(__alloc(), std::__to_address(__begin_ - 1), std::forward<_Args>(__args)...);
+ __alloc_traits::construct(__alloc_, std::__to_address(__begin_ - 1), std::forward<_Args>(__args)...);
--__begin_;
}
template <class _Tp, class _Allocator>
template <class... _Args>
_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::emplace_back(_Args&&... __args) {
- if (__end_ == __end_cap()) {
+ if (__end_ == __end_cap_) {
if (__begin_ > __first_) {
difference_type __d = __begin_ - __first_;
__d = (__d + 1) / 2;
__end_ = std::move(__begin_, __end_, __begin_ - __d);
__begin_ -= __d;
} else {
- size_type __c = std::max<size_type>(2 * static_cast<size_t>(__end_cap() - __first_), 1);
- __split_buffer<value_type, __alloc_rr&> __t(__c, __c / 4, __alloc());
+ size_type __c = std::max<size_type>(2 * static_cast<size_t>(__end_cap_ - __first_), 1);
+ __split_buffer<value_type, __alloc_rr&> __t(__c, __c / 4, __alloc_);
__t.__construct_at_end(move_iterator<pointer>(__begin_), move_iterator<pointer>(__end_));
std::swap(__first_, __t.__first_);
std::swap(__begin_, __t.__begin_);
std::swap(__end_, __t.__end_);
- std::swap(__end_cap(), __t.__end_cap());
+ std::swap(__end_cap_, __t.__end_cap_);
}
}
- __alloc_traits::construct(__alloc(), std::__to_address(__end_), std::forward<_Args>(__args)...);
+ __alloc_traits::construct(__alloc_, std::__to_address(__end_), std::forward<_Args>(__args)...);
++__end_;
}
diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 173ef1d..876490a 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -822,7 +822,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer<value_type, a
__end_ = __begin_; // All the objects have been destroyed by relocating them.
std::swap(this->__begin_, __v.__begin_);
std::swap(this->__end_, __v.__end_);
- std::swap(this->__end_cap(), __v.__end_cap());
+ std::swap(this->__end_cap(), __v.__end_cap_);
__v.__first_ = __v.__begin_;
__annotate_new(size());
}
@@ -852,7 +852,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer<value_type, a
std::swap(this->__begin_, __v.__begin_);
std::swap(this->__end_, __v.__end_);
- std::swap(this->__end_cap(), __v.__end_cap());
+ std::swap(this->__end_cap(), __v.__end_cap_);
__v.__first_ = __v.__begin_;
__annotate_new(size());
return __ret;
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 1a83465..0ded69c 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -1229,8 +1229,8 @@ private:
clear();
shrink_to_fit();
}
- __alloc() = __c.__alloc();
- __map_.__alloc() = __c.__map_.__alloc();
+ __alloc() = __c.__alloc();
+ __map_.__alloc_ = __c.__map_.__alloc_;
}
_LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const deque&, false_type) {}
@@ -1309,7 +1309,7 @@ deque<_Tp, _Allocator>::deque(const deque& __c)
: __map_(__pointer_allocator(__alloc_traits::select_on_container_copy_construction(__c.__alloc()))),
__start_(0),
__size_(0),
- __alloc_(__map_.__alloc()) {
+ __alloc_(__map_.__alloc_) {
__annotate_new(0);
__append(__c.begin(), __c.end());
}
@@ -2061,7 +2061,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity() {
// Else need to allocate 1 buffer, *and* we need to reallocate __map_.
else {
__split_buffer<pointer, __pointer_allocator&> __buf(
- std::max<size_type>(2 * __map_.capacity(), 1), 0, __map_.__alloc());
+ std::max<size_type>(2 * __map_.capacity(), 1), 0, __map_.__alloc_);
typedef __allocator_destructor<_Allocator> _Dp;
unique_ptr<pointer, _Dp> __hold(__alloc_traits::allocate(__a, __block_size), _Dp(__a, __block_size));
@@ -2073,7 +2073,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity() {
std::swap(__map_.__first_, __buf.__first_);
std::swap(__map_.__begin_, __buf.__begin_);
std::swap(__map_.__end_, __buf.__end_);
- std::swap(__map_.__end_cap(), __buf.__end_cap());
+ std::swap(__map_.__end_cap_, __buf.__end_cap_);
__start_ = __map_.size() == 1 ? __block_size / 2 : __start_ + __block_size;
}
__annotate_whole_block(0, __asan_poison);
@@ -2124,7 +2124,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) {
else {
size_type __ds = (__nb + __back_capacity) * __block_size - __map_.empty();
__split_buffer<pointer, __pointer_allocator&> __buf(
- std::max<size_type>(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__alloc());
+ std::max<size_type>(2 * __map_.capacity(), __nb + __map_.size()), 0, __map_.__alloc_);
#if _LIBCPP_HAS_EXCEPTIONS
try {
#endif // _LIBCPP_HAS_EXCEPTIONS
@@ -2150,7 +2150,7 @@ void deque<_Tp, _Allocator>::__add_front_capacity(size_type __n) {
std::swap(__map_.__first_, __buf.__first_);
std::swap(__map_.__begin_, __buf.__begin_);
std::swap(__map_.__end_, __buf.__end_);
- std::swap(__map_.__end_cap(), __buf.__end_cap());
+ std::swap(__map_.__end_cap_, __buf.__end_cap_);
__start_ += __ds;
}
}
@@ -2184,7 +2184,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity() {
// Else need to allocate 1 buffer, *and* we need to reallocate __map_.
else {
__split_buffer<pointer, __pointer_allocator&> __buf(
- std::max<size_type>(2 * __map_.capacity(), 1), __map_.size(), __map_.__alloc());
+ std::max<size_type>(2 * __map_.capacity(), 1), __map_.size(), __map_.__alloc_);
typedef __allocator_destructor<_Allocator> _Dp;
unique_ptr<pointer, _Dp> __hold(__alloc_traits::allocate(__a, __block_size), _Dp(__a, __block_size));
@@ -2196,7 +2196,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity() {
std::swap(__map_.__first_, __buf.__first_);
std::swap(__map_.__begin_, __buf.__begin_);
std::swap(__map_.__end_, __buf.__end_);
- std::swap(__map_.__end_cap(), __buf.__end_cap());
+ std::swap(__map_.__end_cap_, __buf.__end_cap_);
__annotate_whole_block(__map_.size() - 1, __asan_poison);
}
}
@@ -2249,7 +2249,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) {
__split_buffer<pointer, __pointer_allocator&> __buf(
std::max<size_type>(2 * __map_.capacity(), __nb + __map_.size()),
__map_.size() - __front_capacity,
- __map_.__alloc());
+ __map_.__alloc_);
#if _LIBCPP_HAS_EXCEPTIONS
try {
#endif // _LIBCPP_HAS_EXCEPTIONS
@@ -2275,7 +2275,7 @@ void deque<_Tp, _Allocator>::__add_back_capacity(size_type __n) {
std::swap(__map_.__first_, __buf.__first_);
std::swap(__map_.__begin_, __buf.__begin_);
std::swap(__map_.__end_, __buf.__end_);
- std::swap(__map_.__end_cap(), __buf.__end_cap());
+ std::swap(__map_.__end_cap_, __buf.__end_cap_);
__start_ -= __ds;
}
}
diff --git a/libcxx/include/string b/libcxx/include/string
index 20e44ea..e4b2d7a 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -2083,6 +2083,8 @@ private:
size_type __guess = __align_it<__boundary>(__s + 1) - 1;
if (__guess == __min_cap)
__guess += __endian_factor;
+
+ _LIBCPP_ASSERT_INTERNAL(__guess >= __s, "recommendation is below the requested size");
return __guess;
}
@@ -3346,12 +3348,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re
if (__requested_capacity <= capacity())
return;
- size_type __target_capacity = std::max(__requested_capacity, size());
- __target_capacity = __recommend(__target_capacity);
- if (__target_capacity == capacity())
- return;
-
- __shrink_or_extend(__target_capacity);
+ __shrink_or_extend(__recommend(__requested_capacity));
}
template <class _CharT, class _Traits, class _Allocator>
diff --git a/libcxx/include/version b/libcxx/include/version
index cb75f3b..571b6e3 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -443,7 +443,7 @@ __cpp_lib_void_t 201411L <type_traits>
# undef __cpp_lib_shared_ptr_arrays
# define __cpp_lib_shared_ptr_arrays 201707L
# define __cpp_lib_shift 201806L
-// # define __cpp_lib_smart_ptr_for_overwrite 202002L
+# define __cpp_lib_smart_ptr_for_overwrite 202002L
# define __cpp_lib_source_location 201907L
# define __cpp_lib_span 202002L
# define __cpp_lib_ssize 201902L
diff --git a/libcxx/test/benchmarks/CartesianBenchmarks.h b/libcxx/test/benchmarks/CartesianBenchmarks.h
index eca4e15..c712230 100644
--- a/libcxx/test/benchmarks/CartesianBenchmarks.h
+++ b/libcxx/test/benchmarks/CartesianBenchmarks.h
@@ -27,11 +27,11 @@ constexpr auto makeEnumValueTuple(std::index_sequence<Idxs...>) {
}
template <class B>
-static auto skip(const B& Bench, int) -> decltype(Bench.skip()) {
+auto skip(const B& Bench, int) -> decltype(Bench.skip()) {
return Bench.skip();
}
template <class B>
-static auto skip(const B& Bench, char) {
+auto skip(const B&, char) {
return false;
}
@@ -51,7 +51,7 @@ void makeBenchmarkFromValues(const std::vector<std::tuple<Args...> >& A) {
}
template <template <class...> class B, class Args, class... U>
-void makeBenchmarkImpl(const Args& A, std::tuple<U...> t) {
+void makeBenchmarkImpl(const Args& A, std::tuple<U...>) {
makeBenchmarkFromValues<B<U...> >(A);
}
diff --git a/libcxx/test/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h
index 5404814..742c848 100644
--- a/libcxx/test/benchmarks/ContainerBenchmarks.h
+++ b/libcxx/test/benchmarks/ContainerBenchmarks.h
@@ -150,7 +150,7 @@ void BM_EmplaceDuplicate(benchmark::State& st, Container c, GenInputs gen) {
}
template <class Container, class GenInputs>
-static void BM_Find(benchmark::State& st, Container c, GenInputs gen) {
+void BM_Find(benchmark::State& st, Container c, GenInputs gen) {
auto in = gen(st.range(0));
c.insert(in.begin(), in.end());
benchmark::DoNotOptimize(&(*c.begin()));
@@ -164,7 +164,7 @@ static void BM_Find(benchmark::State& st, Container c, GenInputs gen) {
}
template <class Container, class GenInputs>
-static void BM_FindRehash(benchmark::State& st, Container c, GenInputs gen) {
+void BM_FindRehash(benchmark::State& st, Container c, GenInputs gen) {
c.rehash(8);
auto in = gen(st.range(0));
c.insert(in.begin(), in.end());
@@ -179,7 +179,7 @@ static void BM_FindRehash(benchmark::State& st, Container c, GenInputs gen) {
}
template <class Container, class GenInputs>
-static void BM_Rehash(benchmark::State& st, Container c, GenInputs gen) {
+void BM_Rehash(benchmark::State& st, Container c, GenInputs gen) {
auto in = gen(st.range(0));
c.max_load_factor(3.0);
c.insert(in.begin(), in.end());
@@ -193,7 +193,7 @@ static void BM_Rehash(benchmark::State& st, Container c, GenInputs gen) {
}
template <class Container, class GenInputs>
-static void BM_Compare_same_container(benchmark::State& st, Container, GenInputs gen) {
+void BM_Compare_same_container(benchmark::State& st, Container, GenInputs gen) {
auto in = gen(st.range(0));
Container c1(in.begin(), in.end());
Container c2 = c1;
@@ -208,7 +208,7 @@ static void BM_Compare_same_container(benchmark::State& st, Container, GenInputs
}
template <class Container, class GenInputs>
-static void BM_Compare_different_containers(benchmark::State& st, Container, GenInputs gen) {
+void BM_Compare_different_containers(benchmark::State& st, Container, GenInputs gen) {
auto in1 = gen(st.range(0));
auto in2 = gen(st.range(0));
Container c1(in1.begin(), in1.end());
diff --git a/libcxx/test/benchmarks/VariantBenchmarks.h b/libcxx/test/benchmarks/VariantBenchmarks.h
index ad36b59..bb0c714 100644
--- a/libcxx/test/benchmarks/VariantBenchmarks.h
+++ b/libcxx/test/benchmarks/VariantBenchmarks.h
@@ -35,7 +35,7 @@ static auto genVariants(std::index_sequence<Is...>) {
std::array<V, N> result = {};
for (auto& v : result) {
- v = fs[getRandomInteger(0ul, sizeof...(Is) - 1)]();
+ v = fs[getRandomInteger(std::size_t(0), sizeof...(Is) - 1)]();
}
return result;
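
The cast above matters because both bounds are deduced to a single integer type; with `0ul` the two arguments can disagree on platforms where std::size_t is not unsigned long (64-bit Windows, for instance), which is presumably why the literal is replaced. A reduced sketch, with pickUniform standing in for getRandomInteger:

    #include <cstddef>
    #include <iostream>
    #include <random>

    // Stand-in for getRandomInteger: both bounds must deduce the same IntT.
    template <class IntT>
    IntT pickUniform(IntT lo, IntT hi) {
      static std::mt19937_64 gen(7);
      return std::uniform_int_distribution<IntT>(lo, hi)(gen);
    }

    int main() {
      constexpr std::size_t count = 4; // plays the role of sizeof...(Is)
      // pickUniform(0ul, count - 1);  // deduction can fail: unsigned long vs std::size_t
      std::cout << pickUniform(std::size_t(0), count - 1) << '\n'; // both bounds are std::size_t
    }
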
diff --git a/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp b/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
index ed2e337..42ebce8 100644
--- a/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <array>
#include <cassert>
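
This is the first of many files in the patch that gain a `// UNSUPPORTED:` line. These are lit directives, ordinary comments that the libc++ test runner parses before deciding whether and how to compile a file. A sketch of the three directive kinds used throughout this patch (the file body is a placeholder):

    // Skip the file entirely when the suite runs in one of these -std modes:
    // UNSUPPORTED: c++03, c++11, c++14, c++17

    // Build and run the file only when the named feature exists in the current
    // configuration (libc++'s config also exposes supported compiler flags as
    // features, which is what the allocation benchmark below relies on):
    // REQUIRES: -fsized-deallocation

    // Append extra flags to this file's compile command:
    // ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated

    #include <algorithm>

    int main() { return 0; }
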
diff --git a/libcxx/test/benchmarks/algorithms/count.bench.cpp b/libcxx/test/benchmarks/algorithms/count.bench.cpp
index 7370293..46b85e9 100644
--- a/libcxx/test/benchmarks/algorithms/count.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/count.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstring>
diff --git a/libcxx/test/benchmarks/algorithms/equal.bench.cpp b/libcxx/test/benchmarks/algorithms/equal.bench.cpp
index 6d63d8c..2dc1158 100644
--- a/libcxx/test/benchmarks/algorithms/equal.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/equal.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <vector>
diff --git a/libcxx/test/benchmarks/algorithms/fill.bench.cpp b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
index 40f3742..c157b5e 100644
--- a/libcxx/test/benchmarks/algorithms/fill.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <vector>
diff --git a/libcxx/test/benchmarks/algorithms/find.bench.cpp b/libcxx/test/benchmarks/algorithms/find.bench.cpp
index 6ff2d95..43d1034 100644
--- a/libcxx/test/benchmarks/algorithms/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/find.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstring>
diff --git a/libcxx/test/benchmarks/algorithms/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/for_each.bench.cpp
index 7019dc1..554c9ec 100644
--- a/libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/for_each.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <deque>
diff --git a/libcxx/test/benchmarks/algorithms/lexicographical_compare.bench.cpp b/libcxx/test/benchmarks/algorithms/lexicographical_compare.bench.cpp
index 0c54526..e9d7710b 100755
--- a/libcxx/test/benchmarks/algorithms/lexicographical_compare.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/lexicographical_compare.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <vector>
diff --git a/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp b/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
index 3be5010..d9d5796 100644
--- a/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <numeric>
#include <random>
diff --git a/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
index dade7b8..b7320e1 100644
--- a/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
index 48f34f8..5991d28 100644
--- a/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/min.bench.cpp b/libcxx/test/benchmarks/algorithms/min.bench.cpp
index a09bd53..ee426c7 100644
--- a/libcxx/test/benchmarks/algorithms/min.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/min.bench.cpp
@@ -6,10 +6,13 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <cassert>
#include <benchmark/benchmark.h>
+#include "test_macros.h"
void run_sizes(auto benchmark) {
benchmark->Arg(1)
@@ -68,7 +71,9 @@ BENCHMARK(BM_std_min<char>)->Apply(run_sizes);
BENCHMARK(BM_std_min<short>)->Apply(run_sizes);
BENCHMARK(BM_std_min<int>)->Apply(run_sizes);
BENCHMARK(BM_std_min<long long>)->Apply(run_sizes);
+#ifndef TEST_HAS_NO_INT128
BENCHMARK(BM_std_min<__int128>)->Apply(run_sizes);
+#endif
BENCHMARK(BM_std_min<unsigned char>)->Apply(run_sizes);
BENCHMARK(BM_std_min<unsigned short>)->Apply(run_sizes);
BENCHMARK(BM_std_min<unsigned int>)->Apply(run_sizes);
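
BM_std_min<__int128> is now guarded because __int128 is a compiler extension rather than a standard type; test_macros.h exposes its absence as TEST_HAS_NO_INT128. Outside the suite, the equivalent check is the compiler-provided __SIZEOF_INT128__ macro, as in this small sketch:

    #include <iostream>

    #if defined(__SIZEOF_INT128__)
    using wide_t = __int128;  // the extension is available
    #else
    using wide_t = long long; // portable fallback
    #endif

    int main() {
      wide_t x = 1;
      for (int i = 0; i < 100; ++i)
        x = (x * 3) % 1000000007; // arithmetic works on either type
      std::cout << static_cast<long long>(x) << '\n';
    }
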
diff --git a/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
index e2c6423..4dbfed6 100644
--- a/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/minmax.bench.cpp b/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
index ca1cdb4..7d7c74c 100644
--- a/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <cassert>
diff --git a/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
index 7917828..348009a 100644
--- a/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <random>
diff --git a/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
index 26cdd25..5fef522 100644
--- a/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
index 72541f7..10254ac 100644
--- a/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: libcpp-has-no-incomplete-pstl
+
#include <algorithm>
#include <execution>
diff --git a/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
index ba96fa1..89d8122 100644
--- a/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
index f36ebff..b98e17a 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <iterator>
diff --git a/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
index 049af7c2..c975d16 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <iterator>
diff --git a/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
index 66a8335..c04ea36 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
index 01632c8..b84d3c2 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
index bcc7a83..ab3ae6f 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
index 902f481..8139ba3 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
index aeb2aed..d145a15 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
index 62c607c..90e9b9a 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
index 8832748..acc2f3f 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
index b3fb15f..9bde4bb 100644
--- a/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <cstdlib>
#include <iterator>
diff --git a/libcxx/test/benchmarks/algorithms/sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sort.bench.cpp
index f87434b..899272e 100644
--- a/libcxx/test/benchmarks/algorithms/sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/sort.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
index 1372b4d..ee4b6bfc 100644
--- a/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
index 024a036..c68f738 100644
--- a/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include "common.h"
diff --git a/libcxx/test/benchmarks/allocation.bench.cpp b/libcxx/test/benchmarks/allocation.bench.cpp
index 1d0c71f..66a9b88 100644
--- a/libcxx/test/benchmarks/allocation.bench.cpp
+++ b/libcxx/test/benchmarks/allocation.bench.cpp
@@ -6,12 +6,18 @@
//
//===----------------------------------------------------------------------===//
+// REQUIRES: -fsized-deallocation
+// ADDITIONAL_COMPILE_FLAGS: -fsized-deallocation
+
#include "benchmark/benchmark.h"
#include <cassert>
+#include <cstdlib>
#include <new>
#include <vector>
+#include "test_macros.h"
+
struct PointerList {
PointerList* Next = nullptr;
};
@@ -26,6 +32,7 @@ struct NewWrapper {
__attribute__((always_inline)) static void Deallocate(void* P, size_t) { ::operator delete(P); }
};
+#ifdef TEST_COMPILER_CLANG
struct BuiltinNewWrapper {
__attribute__((always_inline)) static void* Allocate(size_t N) { return __builtin_operator_new(N); }
__attribute__((always_inline)) static void Deallocate(void* P, size_t) { __builtin_operator_delete(P); }
@@ -35,6 +42,7 @@ struct BuiltinSizedNewWrapper {
__attribute__((always_inline)) static void* Allocate(size_t N) { return __builtin_operator_new(N); }
__attribute__((always_inline)) static void Deallocate(void* P, size_t N) { __builtin_operator_delete(P, N); }
};
+#endif
template <class AllocWrapper>
static void BM_AllocateAndDeallocate(benchmark::State& st) {
@@ -93,11 +101,12 @@ static int RegisterAllocBenchmarks() {
} TestCases[] = {
{"BM_Malloc", &BM_AllocateAndDeallocate<MallocWrapper>},
{"BM_New", &BM_AllocateAndDeallocate<NewWrapper>},
+#ifdef TEST_COMPILER_CLANG
{"BM_BuiltinNewDelete", BM_AllocateAndDeallocate<BuiltinNewWrapper>},
{"BM_BuiltinSizedNewDelete", BM_AllocateAndDeallocate<BuiltinSizedNewWrapper>},
{"BM_BuiltinNewAllocateOnly", BM_AllocateOnly<BuiltinSizedNewWrapper>},
{"BM_BuiltinNewSizedDeallocateOnly", BM_DeallocateOnly<BuiltinSizedNewWrapper>},
-
+#endif
};
for (auto TC : TestCases) {
benchmark::RegisterBenchmark(TC.name, TC.func)->Range(16, 4096 * 2);
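
Two things happen above: the file now asks lit for -fsized-deallocation, and the wrappers around __builtin_operator_new / __builtin_operator_delete are limited to Clang, since those spellings are a Clang extension. A standalone sketch of sized deallocation itself, the feature the flag turns on for plain `delete` expressions (the replaced global operators below are illustrative):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    #include <new>

    void* operator new(std::size_t n) {
      if (void* p = std::malloc(n))
        return p;
      throw std::bad_alloc();
    }

    void operator delete(void* p) noexcept { std::free(p); }

    // With sized deallocation enabled, a plain `delete ptr` calls this overload
    // whenever the compiler knows the object's size, so an allocator can skip
    // looking the size up.
    void operator delete(void* p, std::size_t n) noexcept {
      std::printf("freeing %zu bytes\n", n);
      std::free(p);
    }

    int main() {
      int* p = new int(42);
      delete p; // expected to print "freeing 4 bytes" when -fsized-deallocation is in effect
    }
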
diff --git a/libcxx/test/benchmarks/atomic_wait.bench.cpp b/libcxx/test/benchmarks/atomic_wait.bench.cpp
index dd541b4..49503a3 100644
--- a/libcxx/test/benchmarks/atomic_wait.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait.bench.cpp
@@ -6,8 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <atomic>
#include <numeric>
+#include <stop_token>
#include <thread>
#include "benchmark/benchmark.h"
diff --git a/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
index 1a52e5d..b6f7f40 100644
--- a/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
// To run this test, build libcxx and cxx-benchmarks targets
// cd third-party/benchmark/tools
// ./compare.py filters ../../../build/libcxx/benchmarks/atomic_wait_vs_mutex_lock.libcxx.out BM_atomic_wait BM_mutex
@@ -13,6 +15,7 @@
#include <atomic>
#include <mutex>
#include <numeric>
+#include <stop_token>
#include <thread>
#include "benchmark/benchmark.h"
diff --git a/libcxx/test/benchmarks/deque.bench.cpp b/libcxx/test/benchmarks/deque.bench.cpp
index d6dadaa..b8f3b76 100644
--- a/libcxx/test/benchmarks/deque.bench.cpp
+++ b/libcxx/test/benchmarks/deque.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <deque>
#include "benchmark/benchmark.h"
diff --git a/libcxx/test/benchmarks/deque_iterator.bench.cpp b/libcxx/test/benchmarks/deque_iterator.bench.cpp
index 0eb23f2..d1db8ed 100644
--- a/libcxx/test/benchmarks/deque_iterator.bench.cpp
+++ b/libcxx/test/benchmarks/deque_iterator.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <deque>
diff --git a/libcxx/test/benchmarks/exception_ptr.bench.cpp b/libcxx/test/benchmarks/exception_ptr.bench.cpp
index 1292ad7..7791c51 100644
--- a/libcxx/test/benchmarks/exception_ptr.bench.cpp
+++ b/libcxx/test/benchmarks/exception_ptr.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03
+
#include <benchmark/benchmark.h>
#include <exception>
diff --git a/libcxx/test/benchmarks/filesystem.bench.cpp b/libcxx/test/benchmarks/filesystem.bench.cpp
index 19f9586..83a87c86 100644
--- a/libcxx/test/benchmarks/filesystem.bench.cpp
+++ b/libcxx/test/benchmarks/filesystem.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <filesystem>
#include "GenerateInput.h"
diff --git a/libcxx/test/benchmarks/format.bench.cpp b/libcxx/test/benchmarks/format.bench.cpp
index 22f369f..267ef22 100644
--- a/libcxx/test/benchmarks/format.bench.cpp
+++ b/libcxx/test/benchmarks/format.bench.cpp
@@ -6,12 +6,15 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <format>
#include <string>
#include "benchmark/benchmark.h"
#include "make_string.h"
+#include "test_macros.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
@@ -28,6 +31,8 @@ static void BM_format_string(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * size * sizeof(CharT));
}
BENCHMARK(BM_format_string<char>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
BENCHMARK(BM_format_string<wchar_t>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#endif
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/format/write_double_comparison.bench.cpp b/libcxx/test/benchmarks/format/write_double_comparison.bench.cpp
index 93db5f1..9bd4c5c 100644
--- a/libcxx/test/benchmarks/format/write_double_comparison.bench.cpp
+++ b/libcxx/test/benchmarks/format/write_double_comparison.bench.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Don't warn about std::sprintf
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
#include <array>
#include <charconv>
#include <cstdio>
@@ -29,7 +34,7 @@ static void BM_sprintf(benchmark::State& state) {
std::array<char, 100> output;
while (state.KeepRunningBatch(data.size()))
for (auto value : data) {
- sprintf(output.data(), "%f", value);
+ std::sprintf(output.data(), "%f", value);
benchmark::DoNotOptimize(output.data());
}
}
diff --git a/libcxx/test/benchmarks/format/write_int_comparison.bench.cpp b/libcxx/test/benchmarks/format/write_int_comparison.bench.cpp
index 835a56f..e71e545 100644
--- a/libcxx/test/benchmarks/format/write_int_comparison.bench.cpp
+++ b/libcxx/test/benchmarks/format/write_int_comparison.bench.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Don't warn about std::sprintf
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
#include <array>
#include <charconv>
#include <cstdio>
@@ -29,7 +34,7 @@ static void BM_sprintf(benchmark::State& state) {
std::array<char, 100> output;
while (state.KeepRunningBatch(data.size()))
for (auto value : data) {
- sprintf(output.data(), "%d", value);
+ std::sprintf(output.data(), "%d", value);
benchmark::DoNotOptimize(output.data());
}
}
diff --git a/libcxx/test/benchmarks/format/write_string_comparison.bench.cpp b/libcxx/test/benchmarks/format/write_string_comparison.bench.cpp
index bbd49f7..3e76b62 100644
--- a/libcxx/test/benchmarks/format/write_string_comparison.bench.cpp
+++ b/libcxx/test/benchmarks/format/write_string_comparison.bench.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Don't warn about std::sprintf
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
#include <array>
#include <concepts>
#include <cstdio>
@@ -135,7 +140,7 @@ std::string_view string_view_6000_characters = c_string_6000_characters;
static void BM_sprintf(benchmark::State& state, const char* value) {
std::array<char, 10'000> output;
for (auto _ : state)
- benchmark::DoNotOptimize(sprintf(output.data(), "%s", value));
+ benchmark::DoNotOptimize(std::sprintf(output.data(), "%s", value));
}
template <class T>
@@ -191,18 +196,6 @@ static void BM_format_to_iterator(benchmark::State& state, const T& value, F&& f
\
/* */
-// Verify these types have an iterator that format has optimizations for.
-LIBCPP_STATIC_ASSERT(std::same_as<std::array<char, 1>::iterator, char*> || // the type depends on an ABI flag
- std::same_as<std::array<char, 1>::iterator, std::__wrap_iter<char*>>);
-LIBCPP_STATIC_ASSERT(std::same_as<std::string::iterator, std::__wrap_iter<char*>>);
-LIBCPP_STATIC_ASSERT(std::same_as<std::vector<char>::iterator, std::__wrap_iter<char*>>);
-
-// Verify these types have an iterator that format does not optimize for
-LIBCPP_STATIC_ASSERT(!std::same_as<std::deque<char>::iterator, char*> &&
- !std::same_as<std::deque<char>::iterator, std::__wrap_iter<char*>>);
-LIBCPP_STATIC_ASSERT(!std::same_as<std::list<char>::iterator, char*> &&
- !std::same_as<std::list<char>::iterator, std::__wrap_iter<char*>>);
-
BENCHMARK_CAPTURE(BM_sprintf, C_string_len_6, c_string_6_characters);
FORMAT_BENCHMARKS(C_string_len_6, c_string_6_characters)
FORMAT_BENCHMARKS(string_len_6, string_6_characters)
diff --git a/libcxx/test/benchmarks/format_to.bench.cpp b/libcxx/test/benchmarks/format_to.bench.cpp
index e8fc6c8..5b06e82 100644
--- a/libcxx/test/benchmarks/format_to.bench.cpp
+++ b/libcxx/test/benchmarks/format_to.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <format>
#include <algorithm>
@@ -18,6 +20,7 @@
#include "benchmark/benchmark.h"
#include "make_string.h"
+#include "test_macros.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
@@ -90,6 +93,7 @@ BENCHMARK(BM_format_to_string_begin<std::list<char>>)->RangeMultiplier(2)->Range
BENCHMARK(BM_format_to_string_span<char>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_string_pointer<char>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
BENCHMARK(BM_format_to_string_back_inserter<std::wstring>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_string_back_inserter<std::vector<wchar_t>>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_string_back_inserter<std::list<wchar_t>>)->RangeMultiplier(2)->Range(1, 1 << 20);
@@ -98,5 +102,6 @@ BENCHMARK(BM_format_to_string_begin<std::vector<wchar_t>>)->RangeMultiplier(2)->
BENCHMARK(BM_format_to_string_begin<std::list<wchar_t>>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_string_span<wchar_t>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_string_pointer<wchar_t>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#endif
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/format_to_n.bench.cpp b/libcxx/test/benchmarks/format_to_n.bench.cpp
index 96386ff..30f6ce7 100644
--- a/libcxx/test/benchmarks/format_to_n.bench.cpp
+++ b/libcxx/test/benchmarks/format_to_n.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <format>
#include <algorithm>
@@ -18,6 +20,7 @@
#include "benchmark/benchmark.h"
#include "make_string.h"
+#include "test_macros.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
@@ -90,6 +93,7 @@ BENCHMARK(BM_format_to_n_string_begin<std::list<char>>)->RangeMultiplier(2)->Ran
BENCHMARK(BM_format_to_n_string_span<char>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_n_string_pointer<char>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
BENCHMARK(BM_format_to_n_string_back_inserter<std::wstring>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_n_string_back_inserter<std::vector<wchar_t>>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_n_string_back_inserter<std::list<wchar_t>>)->RangeMultiplier(2)->Range(1, 1 << 20);
@@ -98,5 +102,6 @@ BENCHMARK(BM_format_to_n_string_begin<std::vector<wchar_t>>)->RangeMultiplier(2)
BENCHMARK(BM_format_to_n_string_begin<std::list<wchar_t>>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_n_string_span<wchar_t>)->RangeMultiplier(2)->Range(1, 1 << 20);
BENCHMARK(BM_format_to_n_string_pointer<wchar_t>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#endif
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/formatted_size.bench.cpp b/libcxx/test/benchmarks/formatted_size.bench.cpp
index 9af2343..e244f0b 100644
--- a/libcxx/test/benchmarks/formatted_size.bench.cpp
+++ b/libcxx/test/benchmarks/formatted_size.bench.cpp
@@ -6,12 +6,15 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <format>
#include <string>
#include "benchmark/benchmark.h"
#include "make_string.h"
+#include "test_macros.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
@@ -26,6 +29,8 @@ static void BM_formatted_size_string(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * size * sizeof(CharT));
}
BENCHMARK(BM_formatted_size_string<char>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
BENCHMARK(BM_formatted_size_string<wchar_t>)->RangeMultiplier(2)->Range(1, 1 << 20);
+#endif
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/formatter_float.bench.cpp b/libcxx/test/benchmarks/formatter_float.bench.cpp
index d1da585..ec20eab 100644
--- a/libcxx/test/benchmarks/formatter_float.bench.cpp
+++ b/libcxx/test/benchmarks/formatter_float.bench.cpp
@@ -6,9 +6,13 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <format>
#include <array>
+#include <bit>
+#include <cmath>
#include <limits>
#include <random>
#include <string>
@@ -90,9 +94,9 @@ struct Value<ValueE::Random> {
std::array<F, 1000> result;
std::generate(result.begin(), result.end(), [&] {
while (true) {
- auto result = std::bit_cast<F>(distribution(generator));
- if (std::isfinite(result))
- return result;
+ auto val = std::bit_cast<F>(distribution(generator));
+ if (std::isfinite(val))
+ return val;
}
});
return result;
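
The loop above draws random 64-bit patterns, reinterprets them as floating-point values with std::bit_cast, and keeps only finite results; renaming the inner variable also stops it from shadowing the outer `result` array. A reduced sketch of the same technique (C++20; seed and types are illustrative):

    #include <bit>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <random>

    int main() {
      std::mt19937_64 gen(123);
      std::uniform_int_distribution<std::uint64_t> bits;
      double d;
      do {
        // Reinterpret a random 64-bit pattern as a double; retry on NaN/inf.
        d = std::bit_cast<double>(bits(gen));
      } while (!std::isfinite(d));
      std::printf("%g\n", d);
    }
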
diff --git a/libcxx/test/benchmarks/formatter_int.bench.cpp b/libcxx/test/benchmarks/formatter_int.bench.cpp
index a02b3ac..68f0f18 100644
--- a/libcxx/test/benchmarks/formatter_int.bench.cpp
+++ b/libcxx/test/benchmarks/formatter_int.bench.cpp
@@ -6,12 +6,15 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <array>
#include <format>
#include <random>
#include "CartesianBenchmarks.h"
#include "benchmark/benchmark.h"
+#include "test_macros.h"
// Tests the full range of the value.
template <class T>
@@ -49,11 +52,13 @@ static void BM_BasicLow(benchmark::State& state) {
for (auto value : data)
benchmark::DoNotOptimize(std::format_to(output.begin(), "{}", value));
}
+#ifndef TEST_HAS_NO_INT128
BENCHMARK(BM_BasicLow<__uint128_t>);
BENCHMARK(BM_BasicLow<__int128_t>);
BENCHMARK(BM_Basic<__uint128_t>);
BENCHMARK(BM_Basic<__int128_t>);
+#endif
// *** Localization ***
enum class LocalizationE { False, True };
diff --git a/libcxx/test/benchmarks/function.bench.cpp b/libcxx/test/benchmarks/function.bench.cpp
index dd397bc..e607162 100644
--- a/libcxx/test/benchmarks/function.bench.cpp
+++ b/libcxx/test/benchmarks/function.bench.cpp
@@ -6,10 +6,13 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
+#include <utility>
#include "CartesianBenchmarks.h"
#include "benchmark/benchmark.h"
@@ -101,6 +104,7 @@ inline Function MakeFunction(FunctionType type, bool opaque = false) {
case FunctionType::LargeNonTrivialFunctor:
return maybeOpaque(LargeNonTrivialFunctor{}, opaque);
}
+ std::unreachable();
}
template <class Opacity, class FunctionType>
@@ -179,7 +183,7 @@ template <class FunctionType>
struct Invoke {
static void run(benchmark::State& state) {
S s;
- const auto value = MakeFunction(FunctionType());
+ auto value = MakeFunction(FunctionType());
for (auto _ : state) {
benchmark::DoNotOptimize(value);
benchmark::DoNotOptimize(value(&s));
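
std::unreachable() (C++23, <utility>) is inserted after a switch that covers every FunctionType enumerator: it marks the fall-through path as impossible, so the compiler stops warning that control reaches the end of a non-void function, while still warning if a new enumerator is added without a case. A reduced sketch (compile with -std=c++23; Kind and name are illustrative):

    #include <cstdio>
    #include <utility>

    enum class Kind { Small, Large };

    const char* name(Kind k) {
      switch (k) {
      case Kind::Small:
        return "small";
      case Kind::Large:
        return "large";
      }
      // Every enumerator is handled above; reaching this point would mean `k`
      // holds an invalid value, which the program promises cannot happen.
      std::unreachable();
    }

    int main() { std::printf("%s\n", name(Kind::Large)); }
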
diff --git a/libcxx/test/benchmarks/hash.bench.cpp b/libcxx/test/benchmarks/hash.bench.cpp
index e015987..1e1a0f3 100644
--- a/libcxx/test/benchmarks/hash.bench.cpp
+++ b/libcxx/test/benchmarks/hash.bench.cpp
@@ -19,38 +19,6 @@
constexpr std::size_t TestNumInputs = 1024;
-template <class _Size>
-inline TEST_ALWAYS_INLINE _Size loadword(const void* __p) {
- _Size __r;
- std::memcpy(&__r, __p, sizeof(__r));
- return __r;
-}
-
-inline TEST_ALWAYS_INLINE std::size_t hash_len_16(std::size_t __u, std::size_t __v) {
- const std::size_t __mul = 0x9ddfea08eb382d69ULL;
- std::size_t __a = (__u ^ __v) * __mul;
- __a ^= (__a >> 47);
- std::size_t __b = (__v ^ __a) * __mul;
- __b ^= (__b >> 47);
- __b *= __mul;
- return __b;
-}
-
-template <std::size_t _Len>
-inline TEST_ALWAYS_INLINE std::size_t hash_len_0_to_8(const char* __s) {
- static_assert(_Len == 4 || _Len == 8, "");
- const uint64_t __a = loadword<uint32_t>(__s);
- const uint64_t __b = loadword<uint32_t>(__s + _Len - 4);
- return hash_len_16(_Len + (__a << 3), __b);
-}
-
-struct UInt32Hash {
- UInt32Hash() = default;
- inline TEST_ALWAYS_INLINE std::size_t operator()(uint32_t data) const {
- return hash_len_0_to_8<4>(reinterpret_cast<const char*>(&data));
- }
-};
-
template <class HashFn, class GenInputs>
void BM_Hash(benchmark::State& st, HashFn fn, GenInputs gen) {
auto in = gen(st.range(0));
@@ -68,13 +36,7 @@ void BM_Hash(benchmark::State& st, HashFn fn, GenInputs gen) {
BENCHMARK_CAPTURE(BM_Hash, uint32_random_std_hash, std::hash<uint32_t>{}, getRandomIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
-BENCHMARK_CAPTURE(BM_Hash, uint32_random_custom_hash, UInt32Hash{}, getRandomIntegerInputs<uint32_t>)
- ->Arg(TestNumInputs);
-
BENCHMARK_CAPTURE(BM_Hash, uint32_top_std_hash, std::hash<uint32_t>{}, getSortedTopBitsIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
-BENCHMARK_CAPTURE(BM_Hash, uint32_top_custom_hash, UInt32Hash{}, getSortedTopBitsIntegerInputs<uint32_t>)
- ->Arg(TestNumInputs);
-
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/join_view.bench.cpp b/libcxx/test/benchmarks/join_view.bench.cpp
index c789a39..9f6db4a 100644
--- a/libcxx/test/benchmarks/join_view.bench.cpp
+++ b/libcxx/test/benchmarks/join_view.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include <deque>
#include <ranges>
diff --git a/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp b/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
index 072935c..a0b33d2 100644
--- a/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
+++ b/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <algorithm>
#include "benchmark/benchmark.h"
diff --git a/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp b/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
index 439eea8..43fe318 100644
--- a/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
+++ b/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03
+
#include <cstddef>
#include "benchmark/benchmark.h"
diff --git a/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp b/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
index df4daf7..e79d865 100644
--- a/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
+++ b/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11
+
#include <cassert>
#include <cstddef>
#include <utility>
diff --git a/libcxx/test/benchmarks/map.bench.cpp b/libcxx/test/benchmarks/map.bench.cpp
index 255164b..81bdc50 100644
--- a/libcxx/test/benchmarks/map.bench.cpp
+++ b/libcxx/test/benchmarks/map.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <cstdint>
#include <map>
diff --git a/libcxx/test/benchmarks/monotonic_buffer.bench.cpp b/libcxx/test/benchmarks/monotonic_buffer.bench.cpp
index 39bb853..66ab580 100644
--- a/libcxx/test/benchmarks/monotonic_buffer.bench.cpp
+++ b/libcxx/test/benchmarks/monotonic_buffer.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <list>
#include <memory_resource>
diff --git a/libcxx/test/benchmarks/numeric/gcd.bench.cpp b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
index f8b6a85..abbc7e9 100644
--- a/libcxx/test/benchmarks/numeric/gcd.bench.cpp
+++ b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <array>
#include <benchmark/benchmark.h>
#include <cstring>
@@ -41,7 +43,8 @@ static void bm_gcd_trivial(benchmark::State& state) {
BENCHMARK(bm_gcd_trivial);
static void bm_gcd_complex(benchmark::State& state) {
- int lhs = 2971215073, rhs = 1836311903;
+ long long lhs = 2971215073;
+ long long rhs = 1836311903;
for (auto _ : state) {
benchmark::DoNotOptimize(lhs);
benchmark::DoNotOptimize(rhs);
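
The original literal overflowed: 2971215073 exceeds INT_MAX (2147483647), so storing it in an int typically wraps to the negative value 2971215073 - 2^32 = -1323752223; the operands are now long long. The pair 2971215073 and 1836311903 are consecutive Fibonacci numbers, the classic gcd worst case, and being consecutive they are coprime. A small standalone check of that arithmetic:

    #include <climits>
    #include <cstdio>
    #include <numeric>

    int main() {
      long long lhs = 2971215073; // needs more than 32 bits: INT_MAX is 2147483647
      long long rhs = 1836311903;
      std::printf("INT_MAX            = %d\n", INT_MAX);
      std::printf("gcd(lhs, rhs)      = %lld\n", std::gcd(lhs, rhs));                 // 1: consecutive Fibonacci numbers
      std::printf("lhs wrapped to int = %d\n", static_cast<int>(lhs - 4294967296LL)); // -1323752223
    }
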
diff --git a/libcxx/test/benchmarks/ordered_set.bench.cpp b/libcxx/test/benchmarks/ordered_set.bench.cpp
index 22540d8..7883233 100644
--- a/libcxx/test/benchmarks/ordered_set.bench.cpp
+++ b/libcxx/test/benchmarks/ordered_set.bench.cpp
@@ -6,9 +6,12 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <algorithm>
#include <cstdint>
#include <memory>
+#include <numeric>
#include <random>
#include <set>
#include <string>
@@ -181,7 +184,7 @@ struct IterateRangeFor : Base {
while (State.KeepRunningBatch(TableSize * NumTables)) {
for (auto& Set : Data.Sets) {
for (auto& V : Set) {
- benchmark::DoNotOptimize(V);
+ benchmark::DoNotOptimize(const_cast<std::set<uint64_t>::reference>(V));
}
}
}
@@ -199,7 +202,7 @@ struct IterateBeginEnd : Base {
while (State.KeepRunningBatch(TableSize * NumTables)) {
for (auto& Set : Data.Sets) {
for (auto it = Set.begin(); it != Set.end(); ++it) {
- benchmark::DoNotOptimize(*it);
+ benchmark::DoNotOptimize(const_cast<std::set<uint64_t>::reference>(*it));
}
}
}
diff --git a/libcxx/test/benchmarks/random.bench.cpp b/libcxx/test/benchmarks/random.bench.cpp
index 0645a4e..e6af4c3 100644
--- a/libcxx/test/benchmarks/random.bench.cpp
+++ b/libcxx/test/benchmarks/random.bench.cpp
@@ -6,9 +6,12 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03
+
#include <algorithm>
#include <array>
#include <cstddef>
+#include <cstdint>
#include <functional>
#include <random>
diff --git a/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp b/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
index 5482935..84a49a8 100644
--- a/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
+++ b/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
// This benchmark compares the performance of std::mutex and std::shared_mutex in contended scenarios.
// it's meant to establish a baseline overhead for std::shared_mutex and std::mutex, and to help inform decisions about
// which mutex to use when selecting a mutex type for a given use case.
diff --git a/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp b/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
index e0e29cf..775f3ed 100644
--- a/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
+++ b/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
@@ -4,14 +4,17 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#ifndef _LIBCPP_HAS_NO_UNICODE
+# include <concepts>
# include <format>
# include <string_view>
# include "benchmark/benchmark.h"
-
# include "make_string.h"
+# include "test_macros.h"
# define SV(S) MAKE_STRING_VIEW(CharT, S)
@@ -282,11 +285,13 @@ BENCHMARK(BM_cyrillic_text<char>);
BENCHMARK(BM_japanese_text<char>);
BENCHMARK(BM_emoji_text<char>);
+# ifndef TEST_HAS_NO_WIDE_CHARACTERS
BENCHMARK(BM_ascii_text<wchar_t>);
BENCHMARK(BM_unicode_text<wchar_t>);
BENCHMARK(BM_cyrillic_text<wchar_t>);
BENCHMARK(BM_japanese_text<wchar_t>);
BENCHMARK(BM_emoji_text<wchar_t>);
+# endif
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp b/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
index c030ab6..0b0cd64c 100644
--- a/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
+++ b/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
@@ -6,18 +6,21 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
// This test formats a larger piece of text in "escaped" mode. It uses several
// datasets to give an impression how the amount of multibyte UTF-8 sequences
// and larger grapheme clusters affect the performance.
#ifndef _LIBCPP_HAS_NO_UNICODE
+# include <concepts>
# include <format>
# include <string_view>
# include "benchmark/benchmark.h"
-
# include "make_string.h"
+# include "test_macros.h"
# define SV(S) MAKE_STRING_VIEW(CharT, S)
@@ -285,11 +288,13 @@ BENCHMARK(BM_cyrillic_escaped<char>);
BENCHMARK(BM_japanese_escaped<char>);
BENCHMARK(BM_emoji_escaped<char>);
+# ifndef TEST_HAS_NO_WIDE_CHARACTERS
BENCHMARK(BM_ascii_escaped<wchar_t>);
BENCHMARK(BM_unicode_escaped<wchar_t>);
BENCHMARK(BM_cyrillic_escaped<wchar_t>);
BENCHMARK(BM_japanese_escaped<wchar_t>);
BENCHMARK(BM_emoji_escaped<wchar_t>);
+# endif
BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/stop_token.bench.cpp b/libcxx/test/benchmarks/stop_token.bench.cpp
index 6be4736..6149f91 100644
--- a/libcxx/test/benchmarks/stop_token.bench.cpp
+++ b/libcxx/test/benchmarks/stop_token.bench.cpp
@@ -6,7 +6,10 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
#include <numeric>
+#include <optional>
#include <stop_token>
#include <thread>
diff --git a/libcxx/test/benchmarks/string.bench.cpp b/libcxx/test/benchmarks/string.bench.cpp
index 49d3722..0d7ce2b 100644
--- a/libcxx/test/benchmarks/string.bench.cpp
+++ b/libcxx/test/benchmarks/string.bench.cpp
@@ -6,7 +6,10 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <cstdint>
+#include <cstdlib>
#include <new>
#include <vector>
@@ -92,6 +95,7 @@ TEST_ALWAYS_INLINE const char* getSmallString(DiffType D) {
case DiffType::ChangeLast:
return "01234567-";
}
+ __builtin_unreachable();
}
static constexpr char LargeStringLiteral[] = "012345678901234567890123456789012345678901234567890123456789012";
@@ -109,6 +113,7 @@ TEST_ALWAYS_INLINE const char* getLargeString(DiffType D) {
case DiffType::ChangeLast:
return "0" LARGE_STRING_FIRST "1" LARGE_STRING_SECOND "-";
}
+ __builtin_unreachable();
}
TEST_ALWAYS_INLINE const char* getHugeString(DiffType D) {
@@ -127,6 +132,7 @@ TEST_ALWAYS_INLINE const char* getHugeString(DiffType D) {
case DiffType::ChangeLast:
return "0123456789" HUGE_STRING4 "0123456789" HUGE_STRING4 "012345678-";
}
+ __builtin_unreachable();
}
TEST_ALWAYS_INLINE const char* getString(Length L, DiffType D = DiffType::Control) {
@@ -140,6 +146,7 @@ TEST_ALWAYS_INLINE const char* getString(Length L, DiffType D = DiffType::Contro
case Length::Huge:
return getHugeString(D);
}
+ __builtin_unreachable();
}
TEST_ALWAYS_INLINE std::string makeString(Length L, DiffType D = DiffType::Control, Opacity O = Opacity::Transparent) {
@@ -153,6 +160,7 @@ TEST_ALWAYS_INLINE std::string makeString(Length L, DiffType D = DiffType::Contr
case Length::Huge:
return maybeOpaque(getHugeString(D), O == Opacity::Opaque);
}
+ __builtin_unreachable();
}
template <class Length, class Opaque>
@@ -170,13 +178,13 @@ template <class Length, bool MeasureCopy, bool MeasureDestroy>
static void StringCopyAndDestroy(benchmark::State& state) {
static constexpr size_t NumStrings = 1024;
auto Orig = makeString(Length());
- std::aligned_storage<sizeof(std::string)>::type Storage[NumStrings];
+ alignas(std::string) char Storage[NumStrings * sizeof(std::string)];
while (state.KeepRunningBatch(NumStrings)) {
if (!MeasureCopy)
state.PauseTiming();
for (size_t I = 0; I < NumStrings; ++I) {
- ::new (static_cast<void*>(Storage + I)) std::string(Orig);
+ ::new (reinterpret_cast<std::string*>(Storage) + I) std::string(Orig);
}
if (!MeasureCopy)
state.ResumeTiming();
@@ -184,7 +192,7 @@ static void StringCopyAndDestroy(benchmark::State& state) {
state.PauseTiming();
for (size_t I = 0; I < NumStrings; ++I) {
using S = std::string;
- reinterpret_cast<S*>(Storage + I)->~S();
+ (reinterpret_cast<S*>(Storage) + I)->~S();
}
if (!MeasureDestroy)
state.ResumeTiming();
@@ -209,16 +217,16 @@ template <class Length>
struct StringMove {
static void run(benchmark::State& state) {
// Keep two object locations and move construct back and forth.
- std::aligned_storage<sizeof(std::string), alignof(std::string)>::type Storage[2];
+ alignas(std::string) char Storage[2 * sizeof(std::string)];
using S = std::string;
size_t I = 0;
- S* newS = new (static_cast<void*>(Storage)) std::string(makeString(Length()));
+ S* newS = new (reinterpret_cast<std::string*>(Storage)) std::string(makeString(Length()));
for (auto _ : state) {
// Switch locations.
I ^= 1;
benchmark::DoNotOptimize(Storage);
// Move construct into the new location,
- S* tmpS = new (static_cast<void*>(Storage + I)) S(std::move(*newS));
+ S* tmpS = new (reinterpret_cast<std::string*>(Storage) + I) S(std::move(*newS));
// then destroy the old one.
newS->~S();
newS = tmpS;
@@ -481,14 +489,14 @@ struct StringRead {
for (auto _ : state) {
// Jump long enough to defeat cache locality, and use a value that is
// coprime with NumStrings to ensure we visit every element.
- I = (I + 17) % NumStrings;
- const auto& V = Values[I];
+ I = (I + 17) % NumStrings;
+ auto& V = Values[I];
// Read everything first. Escaping data() through DoNotOptimize might
// cause the compiler to have to recalculate information about `V` due to
// aliasing.
- const char* const Data = V.data();
- const size_t Size = V.size();
+ char* Data = V.data();
+ size_t Size = V.size();
benchmark::DoNotOptimize(Data);
benchmark::DoNotOptimize(Size);
if (Depth() == ::Depth::Deep) {
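
std::aligned_storage, used here for raw string storage, is deprecated in C++23, so the benchmark switches to an alignas(std::string) char array with placement new, which expresses the same manually managed lifetime. A reduced sketch of that pattern on its own:

    #include <cstdio>
    #include <new>
    #include <string>

    int main() {
      using S = std::string;
      // Raw, correctly aligned storage for two strings; no objects live here yet.
      alignas(S) char storage[2 * sizeof(S)];

      // Begin the lifetimes with placement new...
      S* a = ::new (reinterpret_cast<S*>(storage)) S("hello");
      S* b = ::new (reinterpret_cast<S*>(storage) + 1) S("world");

      std::printf("%s %s\n", a->c_str(), b->c_str());

      // ...and end them explicitly, since `delete` must not be used here.
      a->~S();
      b->~S();
    }
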
diff --git a/libcxx/test/benchmarks/stringstream.bench.cpp b/libcxx/test/benchmarks/stringstream.bench.cpp
index a333900..b7c50a9 100644
--- a/libcxx/test/benchmarks/stringstream.bench.cpp
+++ b/libcxx/test/benchmarks/stringstream.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include "benchmark/benchmark.h"
#include "test_macros.h"
diff --git a/libcxx/test/benchmarks/system_error.bench.cpp b/libcxx/test/benchmarks/system_error.bench.cpp
index 4b0568d..8506efe 100644
--- a/libcxx/test/benchmarks/system_error.bench.cpp
+++ b/libcxx/test/benchmarks/system_error.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03
+
#include <string>
#include <system_error>
diff --git a/libcxx/test/benchmarks/to_chars.bench.cpp b/libcxx/test/benchmarks/to_chars.bench.cpp
index f79dfda..cee969c 100644
--- a/libcxx/test/benchmarks/to_chars.bench.cpp
+++ b/libcxx/test/benchmarks/to_chars.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include <array>
#include <charconv>
#include <random>
diff --git a/libcxx/test/benchmarks/unordered_set_operations.bench.cpp b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
index 2e42d6d..7b1700b 100644
--- a/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
+++ b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <cstdint>
#include <cstdlib>
#include <cstring>
@@ -23,77 +25,9 @@ using namespace ContainerBenchmarks;
constexpr std::size_t TestNumInputs = 1024;
-template <class _Size>
-inline TEST_ALWAYS_INLINE _Size loadword(const void* __p) {
- _Size __r;
- std::memcpy(&__r, __p, sizeof(__r));
- return __r;
-}
-
-inline TEST_ALWAYS_INLINE std::size_t rotate_by_at_least_1(std::size_t __val, int __shift) {
- return (__val >> __shift) | (__val << (64 - __shift));
-}
-
-inline TEST_ALWAYS_INLINE std::size_t hash_len_16(std::size_t __u, std::size_t __v) {
- const std::size_t __mul = 0x9ddfea08eb382d69ULL;
- std::size_t __a = (__u ^ __v) * __mul;
- __a ^= (__a >> 47);
- std::size_t __b = (__v ^ __a) * __mul;
- __b ^= (__b >> 47);
- __b *= __mul;
- return __b;
-}
-
-template <std::size_t _Len>
-inline TEST_ALWAYS_INLINE std::size_t hash_len_0_to_8(const char* __s) {
- static_assert(_Len == 4 || _Len == 8, "");
- const uint64_t __a = loadword<uint32_t>(__s);
- const uint64_t __b = loadword<uint32_t>(__s + _Len - 4);
- return hash_len_16(_Len + (__a << 3), __b);
-}
-
-struct UInt32Hash {
- UInt32Hash() = default;
- inline TEST_ALWAYS_INLINE std::size_t operator()(uint32_t data) const {
- return hash_len_0_to_8<4>(reinterpret_cast<const char*>(&data));
- }
-};
-
-struct UInt64Hash {
- UInt64Hash() = default;
- inline TEST_ALWAYS_INLINE std::size_t operator()(uint64_t data) const {
- return hash_len_0_to_8<8>(reinterpret_cast<const char*>(&data));
- }
-};
-
-struct UInt128Hash {
- UInt128Hash() = default;
- inline TEST_ALWAYS_INLINE std::size_t operator()(__uint128_t data) const {
- const __uint128_t __mask = static_cast<std::size_t>(-1);
- const std::size_t __a = (std::size_t)(data & __mask);
- const std::size_t __b = (std::size_t)((data & (__mask << 64)) >> 64);
- return hash_len_16(__a, rotate_by_at_least_1(__b + 16, 16)) ^ __b;
- }
-};
-
-struct UInt32Hash2 {
- UInt32Hash2() = default;
- inline TEST_ALWAYS_INLINE std::size_t operator()(uint32_t data) const {
- const uint32_t __m = 0x5bd1e995;
- const uint32_t __r = 24;
- uint32_t __h = 4;
- uint32_t __k = data;
- __k *= __m;
- __k ^= __k >> __r;
- __k *= __m;
- __h *= __m;
- __h ^= __k;
- __h ^= __h >> 13;
- __h *= __m;
- __h ^= __h >> 15;
- return __h;
- }
-};
+// The purpose of this hash function is to NOT be implemented as the identity function,
+// which is how std::hash is implemented for smaller integral types.
+struct NonIdentityScalarHash : std::hash<unsigned long long> {};
// The sole purpose of this comparator is to be used in BM_Rehash, where
// we need something slow enough to be easily noticable in benchmark results.
@@ -138,7 +72,7 @@ BENCHMARK_CAPTURE(BM_InsertValue,
BENCHMARK_CAPTURE(BM_InsertValueRehash,
unordered_set_top_bits_uint32,
- std::unordered_set<uint32_t, UInt32Hash>{},
+ std::unordered_set<uint32_t, NonIdentityScalarHash>{},
getSortedTopBitsIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
@@ -171,7 +105,7 @@ BENCHMARK_CAPTURE(
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_random_uint64,
- std::unordered_set<uint64_t, UInt64Hash>{},
+ std::unordered_set<uint64_t, NonIdentityScalarHash>{},
getRandomIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
@@ -182,22 +116,24 @@ BENCHMARK_CAPTURE(
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_uint64,
- std::unordered_set<uint64_t, UInt64Hash>{},
+ std::unordered_set<uint64_t, NonIdentityScalarHash>{},
getSortedIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
// Sorted //
+#ifndef TEST_HAS_NO_INT128
BENCHMARK_CAPTURE(BM_Find,
unordered_set_sorted_uint128,
- std::unordered_set<__uint128_t, UInt128Hash>{},
+ std::unordered_set<__uint128_t>{},
getSortedTopBitsIntegerInputs<__uint128_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_uint128,
- std::unordered_set<__uint128_t, UInt128Hash>{},
+ std::unordered_set<__uint128_t>{},
getSortedTopBitsIntegerInputs<__uint128_t>)
->Arg(TestNumInputs);
+#endif
// Sorted //
BENCHMARK_CAPTURE(
@@ -206,7 +142,7 @@ BENCHMARK_CAPTURE(
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_uint32,
- std::unordered_set<uint32_t, UInt32Hash2>{},
+ std::unordered_set<uint32_t, NonIdentityScalarHash>{},
getSortedIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
@@ -217,7 +153,7 @@ BENCHMARK_CAPTURE(
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_large_uint64,
- std::unordered_set<uint64_t, UInt64Hash>{},
+ std::unordered_set<uint64_t, NonIdentityScalarHash>{},
getSortedLargeIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
@@ -228,7 +164,7 @@ BENCHMARK_CAPTURE(
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_top_bits_uint64,
- std::unordered_set<uint64_t, UInt64Hash>{},
+ std::unordered_set<uint64_t, NonIdentityScalarHash>{},
getSortedTopBitsIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
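
The hand-rolled CityHash/Murmur-style hashers are gone; per the new comment, all that matters for these benchmarks is that the hash is not the identity, so bucket placement does not collapse into bucket index == key. An illustrative example of plugging a non-identity hasher into std::unordered_set (Mix64Hash below is a made-up splitmix64-style mixer, not the hasher the patch uses):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <unordered_set>

    struct Mix64Hash {
      std::size_t operator()(std::uint64_t x) const noexcept {
        // splitmix64-style finalizer: spreads clustered keys across buckets.
        x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
        x ^= x >> 27; x *= 0x94d049bb133111ebULL;
        x ^= x >> 31;
        return static_cast<std::size_t>(x);
      }
    };

    int main() {
      std::unordered_set<std::uint64_t, Mix64Hash> s;
      for (std::uint64_t i = 0; i < 1000; ++i)
        s.insert(i << 32); // keys that only differ in their high bits
      std::printf("buckets: %zu  load factor: %f\n", s.bucket_count(), s.load_factor());
    }
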
diff --git a/libcxx/test/benchmarks/variant_visit_1.bench.cpp b/libcxx/test/benchmarks/variant_visit_1.bench.cpp
index 3e6f22e..42b22aa 100644
--- a/libcxx/test/benchmarks/variant_visit_1.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_1.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include "benchmark/benchmark.h"
#include "VariantBenchmarks.h"
diff --git a/libcxx/test/benchmarks/variant_visit_2.bench.cpp b/libcxx/test/benchmarks/variant_visit_2.bench.cpp
index 43aba48..328048c 100644
--- a/libcxx/test/benchmarks/variant_visit_2.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_2.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include "benchmark/benchmark.h"
#include "VariantBenchmarks.h"
diff --git a/libcxx/test/benchmarks/variant_visit_3.bench.cpp b/libcxx/test/benchmarks/variant_visit_3.bench.cpp
index 8569942..40f8c1b 100644
--- a/libcxx/test/benchmarks/variant_visit_3.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_3.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14
+
#include "benchmark/benchmark.h"
#include "VariantBenchmarks.h"
diff --git a/libcxx/test/benchmarks/vector_operations.bench.cpp b/libcxx/test/benchmarks/vector_operations.bench.cpp
index 8698e45..b0dffe3 100644
--- a/libcxx/test/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/test/benchmarks/vector_operations.bench.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
#include <cstdint>
#include <cstdlib>
#include <cstring>
diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/exceptions.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/exceptions.pass.cpp
index c9c1bac..e2b0d69 100644
--- a/libcxx/test/std/containers/sequences/vector/vector.cons/exceptions.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/vector.cons/exceptions.pass.cpp
@@ -49,13 +49,14 @@ struct ThrowingT {
--throw_after_n;
}
- ThrowingT(const ThrowingT&) {
+ ThrowingT(const ThrowingT& rhs) : throw_after_n_(rhs.throw_after_n_) {
if (throw_after_n_ == nullptr || *throw_after_n_ == 0)
throw 1;
--*throw_after_n_;
}
- ThrowingT& operator=(const ThrowingT&) {
+ ThrowingT& operator=(const ThrowingT& rhs) {
+ throw_after_n_ = rhs.throw_after_n_;
if (throw_after_n_ == nullptr || *throw_after_n_ == 0)
throw 1;
--*throw_after_n_;
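
The old copy operations ignored their source, so every copy started with a null counter and the countdown never propagated; the fix copies throw_after_n_ so that the N-th copy, wherever it happens, is the one that throws. A reduced sketch of that test-helper pattern (ThrowOnCopy and the budget are illustrative):

    #include <cstdio>
    #include <stdexcept>
    #include <vector>

    struct ThrowOnCopy {
      int* countdown_ = nullptr;

      explicit ThrowOnCopy(int* countdown) : countdown_(countdown) {}

      // Copies share the source's countdown; once it hits zero, copying throws.
      ThrowOnCopy(const ThrowOnCopy& rhs) : countdown_(rhs.countdown_) {
        if (countdown_ == nullptr || *countdown_ == 0)
          throw std::runtime_error("copy limit reached");
        --*countdown_;
      }
    };

    int main() {
      int budget = 2;
      std::vector<ThrowOnCopy> v;
      v.reserve(8); // avoid reallocation copies so only the push_back copies count
      try {
        for (int i = 0; i < 5; ++i)
          v.push_back(ThrowOnCopy(&budget));
      } catch (const std::runtime_error&) {
        std::printf("copies made before throwing: %zu\n", v.size()); // 2
      }
    }
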
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp
index 315058f..0094bed 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp
@@ -390,17 +390,11 @@
# error "__cpp_lib_shared_ptr_weak_type should have the value 201606L in c++20"
# endif
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++20"
-# endif
-# if __cpp_lib_smart_ptr_for_overwrite != 202002L
-# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++20"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_smart_ptr_for_overwrite
+# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++20"
+# endif
+# if __cpp_lib_smart_ptr_for_overwrite != 202002L
+# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++20"
# endif
# ifdef __cpp_lib_smart_ptr_owner_equality
@@ -521,17 +515,11 @@
# error "__cpp_lib_shared_ptr_weak_type should have the value 201606L in c++23"
# endif
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++23"
-# endif
-# if __cpp_lib_smart_ptr_for_overwrite != 202002L
-# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++23"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_smart_ptr_for_overwrite
+# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++23"
+# endif
+# if __cpp_lib_smart_ptr_for_overwrite != 202002L
+# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++23"
# endif
# ifdef __cpp_lib_smart_ptr_owner_equality
@@ -652,17 +640,11 @@
# error "__cpp_lib_shared_ptr_weak_type should have the value 201606L in c++26"
# endif
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++26"
-# endif
-# if __cpp_lib_smart_ptr_for_overwrite != 202002L
-# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++26"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_smart_ptr_for_overwrite
+# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++26"
+# endif
+# if __cpp_lib_smart_ptr_for_overwrite != 202002L
+# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++26"
# endif
# if !defined(_LIBCPP_VERSION)
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 5deaee1..74db80c 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -4344,17 +4344,11 @@
# error "__cpp_lib_shift should have the value 201806L in c++20"
# endif
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++20"
-# endif
-# if __cpp_lib_smart_ptr_for_overwrite != 202002L
-# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++20"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_smart_ptr_for_overwrite
+# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++20"
+# endif
+# if __cpp_lib_smart_ptr_for_overwrite != 202002L
+# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++20"
# endif
# ifdef __cpp_lib_smart_ptr_owner_equality
@@ -5929,17 +5923,11 @@
# error "__cpp_lib_shift should have the value 201806L in c++23"
# endif
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++23"
-# endif
-# if __cpp_lib_smart_ptr_for_overwrite != 202002L
-# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++23"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_smart_ptr_for_overwrite
+# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++23"
+# endif
+# if __cpp_lib_smart_ptr_for_overwrite != 202002L
+# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++23"
# endif
# ifdef __cpp_lib_smart_ptr_owner_equality
@@ -7829,17 +7817,11 @@
# error "__cpp_lib_shift should have the value 201806L in c++26"
# endif
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++26"
-# endif
-# if __cpp_lib_smart_ptr_for_overwrite != 202002L
-# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++26"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_smart_ptr_for_overwrite
-# error "__cpp_lib_smart_ptr_for_overwrite should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_smart_ptr_for_overwrite
+# error "__cpp_lib_smart_ptr_for_overwrite should be defined in c++26"
+# endif
+# if __cpp_lib_smart_ptr_for_overwrite != 202002L
+# error "__cpp_lib_smart_ptr_for_overwrite should have the value 202002L in c++26"
# endif
# if !defined(_LIBCPP_VERSION)
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 197d6bb..c32d8d7 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -1196,7 +1196,6 @@ feature_test_macros = [
"name": "__cpp_lib_smart_ptr_for_overwrite",
"values": {"c++20": 202002},
"headers": ["memory"],
- "unimplemented": True,
},
{
"name": "__cpp_lib_smart_ptr_owner_equality",
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index c666a75..bb0a787 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -2116,7 +2116,7 @@ void Debugger::HandleProgressEvent(const lldb::EventSP &event_sp) {
const uint32_t term_width = GetTerminalWidth();
const uint32_t ellipsis = 3;
if (message.size() + ellipsis >= term_width)
- message = message.substr(0, term_width - ellipsis);
+ message.resize(term_width - ellipsis);
const bool use_color = GetUseColor();
llvm::StringRef ansi_prefix = GetShowProgressAnsiPrefix();
diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp
index aca4317..4006f6e 100644
--- a/lldb/source/ValueObject/ValueObject.cpp
+++ b/lldb/source/ValueObject/ValueObject.cpp
@@ -2907,15 +2907,6 @@ ValueObjectSP ValueObject::AddressOf(Status &error) {
AddressType address_type = eAddressTypeInvalid;
const bool scalar_is_load_address = false;
-
- // For reference type we need to get the address of the object that
- // it refers to.
- if (GetCompilerType().IsReferenceType()) {
- ValueObjectSP deref_obj = Dereference(error);
- if (error.Fail() || !deref_obj)
- return ValueObjectSP();
- return deref_obj->AddressOf(error);
- }
addr_t addr = GetAddressOf(scalar_is_load_address, &address_type);
error.Clear();
if (addr != LLDB_INVALID_ADDRESS && address_type != eAddressTypeHost) {
diff --git a/lldb/test/API/lang/cpp/dereferencing_references/TestCPPDereferencingReferences.py b/lldb/test/API/lang/cpp/dereferencing_references/TestCPPDereferencingReferences.py
index 1374d4e..938fb1a6 100644
--- a/lldb/test/API/lang/cpp/dereferencing_references/TestCPPDereferencingReferences.py
+++ b/lldb/test/API/lang/cpp/dereferencing_references/TestCPPDereferencingReferences.py
@@ -25,24 +25,3 @@ class TestCase(TestBase):
# Typedef to a reference should dereference to the underlying type.
td_val = self.expect_var_path("td_to_ref_type", type="td_int_ref")
self.assertEqual(td_val.Dereference().GetType().GetName(), "int")
-
- def test_take_address_of_reference(self):
- """Tests taking address of lvalue/rvalue references in lldb works correctly."""
- self.build()
- lldbutil.run_to_source_breakpoint(
- self, "// break here", lldb.SBFileSpec("main.cpp")
- )
-
- plref_val_from_code = self.expect_var_path("pl_ref", type="TTT *")
- plref_val_from_expr_path = self.expect_var_path("&l_ref", type="TTT *")
- self.assertEqual(
- plref_val_from_code.GetValueAsAddress(),
- plref_val_from_expr_path.GetValueAsAddress(),
- )
-
- prref_val_from_code = self.expect_var_path("pr_ref", type="TTT *")
- prref_val_from_expr_path = self.expect_var_path("&r_ref", type="TTT *")
- self.assertEqual(
- prref_val_from_code.GetValueAsAddress(),
- prref_val_from_expr_path.GetValueAsAddress(),
- )
diff --git a/lldb/test/API/lang/cpp/dereferencing_references/main.cpp b/lldb/test/API/lang/cpp/dereferencing_references/main.cpp
index 4ddffd1..b64978a 100644
--- a/lldb/test/API/lang/cpp/dereferencing_references/main.cpp
+++ b/lldb/test/API/lang/cpp/dereferencing_references/main.cpp
@@ -9,7 +9,5 @@ int main() {
// typedef of a reference
td_int_ref td_to_ref_type = i;
- TTT *pl_ref = &l_ref;
- TTT *pr_ref = &r_ref;
return l_ref; // break here
}
diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli
index 076e651..17cad1f 100644
--- a/llvm/bindings/ocaml/llvm/llvm.mli
+++ b/llvm/bindings/ocaml/llvm/llvm.mli
@@ -696,7 +696,7 @@ val named_struct_type : llcontext -> string -> lltype
(** [struct_set_body ty elts ispacked] sets the body of the named struct [ty]
to the [elts] elements.
- See the moethd [llvm::StructType::setBody]. *)
+ See the method [llvm::StructType::setBody]. *)
val struct_set_body : lltype -> lltype array -> bool -> unit
(** [struct_element_types sty] returns the constituent types of the struct type
diff --git a/llvm/docs/Contributing.rst b/llvm/docs/Contributing.rst
index 2f8ce7b..fd7c0b5 100644
--- a/llvm/docs/Contributing.rst
+++ b/llvm/docs/Contributing.rst
@@ -181,8 +181,8 @@ of LLVM's high-level design, as well as its internals:
.. _irc.oftc.net: irc://irc.oftc.net/llvm
.. _good first issue: https://github.com/llvm/llvm-project/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22
.. _bug tracker: https://github.com/llvm/llvm-project/issues
-.. _clang-format-diff.py: https://reviews.llvm.org/source/llvm-github/browse/main/clang/tools/clang-format/clang-format-diff.py
-.. _git-clang-format: https://reviews.llvm.org/source/llvm-github/browse/main/clang/tools/clang-format/git-clang-format
+.. _clang-format-diff.py: https://github.com/llvm/llvm-project/blob/main/clang/tools/clang-format/clang-format-diff.py
+.. _git-clang-format: https://github.com/llvm/llvm-project/blob/main/clang/tools/clang-format/git-clang-format
.. _LLVM's GitHub: https://github.com/llvm/llvm-project
.. _LLVM's Phabricator (read-only): https://reviews.llvm.org/
.. _LLVM's Open Projects page: https://llvm.org/OpenProjects.html#what
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index bbd9667..765cc32 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4386,7 +4386,7 @@ is defined inline with other types (e.g. ``[2 x {i32, i32}]``) whereas
identified types are always defined at the top level with a name.
Literal types are uniqued by their contents and can never be recursive
or opaque since there is no way to write one. Identified types can be
-recursive, can be opaqued, and are never uniqued.
+opaqued and are never uniqued. Identified types must not be recursive.
:Syntax:
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 2e371b2..290473c 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -56,6 +56,8 @@ Makes programs 10x faster by doing Special New Thing.
Changes to the LLVM IR
----------------------
+* Types are no longer allowed to be recursive.
+
* The `x86_mmx` IR type has been removed. It will be translated to
the standard vector type `<1 x i64>` in bitcode upgrade.
* Renamed `llvm.experimental.stepvector` intrinsic to `llvm.stepvector`.
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 820b5c0..eb0e5ab 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -33,6 +33,7 @@ class Value;
class APInt;
class LLVMContext;
template <typename T> class Expected;
+class Error;
/// Class to represent integer types. Note that this class is also used to
/// represent the built-in integer types: Int1Ty, Int8Ty, Int16Ty, Int32Ty and
@@ -317,9 +318,18 @@ public:
/// suffix if there is a collision. Do not call this on a literal type.
void setName(StringRef Name);
- /// Specify a body for an opaque identified type.
+ /// Specify a body for an opaque identified type, which must not make the type
+ /// recursive.
void setBody(ArrayRef<Type*> Elements, bool isPacked = false);
+ /// Specify a body for an opaque identified type or return an error if it
+ /// would make the type recursive.
+ Error setBodyOrError(ArrayRef<Type *> Elements, bool isPacked = false);
+
+ /// Return an error if the body for an opaque identified type would make it
+ /// recursive.
+ Error checkBody(ArrayRef<Type *> Elements);
+
template <typename... Tys>
std::enable_if_t<are_base_of<Type, Tys...>::value, void>
setBody(Type *elt1, Tys *... elts) {
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index 43bf36d..e7afcbd 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -942,9 +942,14 @@ public:
///
void viewCFG() const;
+ /// viewCFG - This function is meant for use from the debugger. It works just
+ /// like viewCFG(), but generates the dot file with the given file name.
+ void viewCFG(const char *OutputFileName) const;
+
/// Extended form to print edge weights.
void viewCFG(bool ViewCFGOnly, const BlockFrequencyInfo *BFI,
- const BranchProbabilityInfo *BPI) const;
+ const BranchProbabilityInfo *BPI,
+ const char *OutputFileName = nullptr) const;
/// viewCFGOnly - This function is meant for use from the debugger. It works
/// just like viewCFG, but it does not include the contents of basic blocks
@@ -953,6 +958,10 @@ public:
///
void viewCFGOnly() const;
+ /// viewCFG - This function is meant for use from the debugger. It works just
+ /// like viewCFGOnly(), but generates the dot file with the given file name.
+ void viewCFGOnly(const char *OutputFileName) const;
+
/// Extended form to print edge weights.
void viewCFGOnly(const BlockFrequencyInfo *BFI,
const BranchProbabilityInfo *BPI) const;
diff --git a/llvm/lib/Analysis/CFGPrinter.cpp b/llvm/lib/Analysis/CFGPrinter.cpp
index 67a1519..af18fb6 100644
--- a/llvm/lib/Analysis/CFGPrinter.cpp
+++ b/llvm/lib/Analysis/CFGPrinter.cpp
@@ -136,12 +136,18 @@ PreservedAnalyses CFGOnlyPrinterPass::run(Function &F,
///
void Function::viewCFG() const { viewCFG(false, nullptr, nullptr); }
+void Function::viewCFG(const char *OutputFileName) const {
+ viewCFG(false, nullptr, nullptr, OutputFileName);
+}
+
void Function::viewCFG(bool ViewCFGOnly, const BlockFrequencyInfo *BFI,
- const BranchProbabilityInfo *BPI) const {
+ const BranchProbabilityInfo *BPI,
+ const char *OutputFileName) const {
if (!CFGFuncName.empty() && !getName().contains(CFGFuncName))
return;
DOTFuncInfo CFGInfo(this, BFI, BPI, BFI ? getMaxFreq(*this, BFI) : 0);
- ViewGraph(&CFGInfo, "cfg" + getName(), ViewCFGOnly);
+ ViewGraph(&CFGInfo, OutputFileName ? OutputFileName : "cfg" + getName(),
+ ViewCFGOnly);
}
/// viewCFGOnly - This function is meant for use from the debugger. It works
@@ -151,6 +157,10 @@ void Function::viewCFG(bool ViewCFGOnly, const BlockFrequencyInfo *BFI,
///
void Function::viewCFGOnly() const { viewCFGOnly(nullptr, nullptr); }
+void Function::viewCFGOnly(const char *OutputFileName) const {
+ viewCFG(true, nullptr, nullptr, OutputFileName);
+}
+
void Function::viewCFGOnly(const BlockFrequencyInfo *BFI,
const BranchProbabilityInfo *BPI) const {
viewCFG(true, BFI, BPI);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 8ddb2ef..6ad8d21 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -3391,7 +3391,9 @@ bool LLParser::parseStructDefinition(SMLoc TypeLoc, StringRef Name,
(isPacked && parseToken(lltok::greater, "expected '>' in packed struct")))
return true;
- STy->setBody(Body, isPacked);
+ if (auto E = STy->setBodyOrError(Body, isPacked))
+ return tokError(toString(std::move(E)));
+
ResultTy = STy;
return false;
}
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 446c98c..3e82aa7 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2659,7 +2659,8 @@ Error BitcodeReader::parseTypeTableBody() {
}
if (EltTys.size() != Record.size()-1)
return error("Invalid named struct record");
- Res->setBody(EltTys, Record[0]);
+ if (auto E = Res->setBodyOrError(EltTys, Record[0]))
+ return E;
ContainedIDs.append(Record.begin() + 1, Record.end());
ResultTy = Res;
break;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 9a1aa27..ede8d82 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1049,7 +1049,7 @@ bool CombinerHelper::matchSextInRegOfLoad(
Register SrcReg = MI.getOperand(1).getReg();
auto *LoadDef = getOpcodeDef<GLoad>(SrcReg, MRI);
- if (!LoadDef || !MRI.hasOneNonDBGUse(DstReg))
+ if (!LoadDef || !MRI.hasOneNonDBGUse(SrcReg))
return false;
uint64_t MemBits = LoadDef->getMemSizeInBits().getValue();
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index c3c581d..7b630e8 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -385,6 +385,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
continue;
MI->removeOperand(i-1);
}
+ MI->dropMemRefs(*MI->getMF());
LLVM_DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
} else {
// If the dest of MI is an original reg and MI is reMaterializable,
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 7ea0786..da095c6 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1371,7 +1371,8 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI,
}) &&
IsLoopInvariantInst(MI, CurLoop) &&
any_of(MRI->use_nodbg_instructions(DefReg),
- [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
+ [&CurLoop, this, DefReg,
+ Cost = std::move(Cost)](MachineInstr &UseMI) {
if (!CurLoop->contains(&UseMI))
return false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 42232bd..dcd5ca3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7392,6 +7392,16 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return DAG.getNode(ISD::AND, DL, VT, X,
DAG.getNOT(DL, DAG.getNode(Opc, DL, VT, Y, Z), VT));
+ // Fold (and (srl X, C), 1) -> (srl X, BW-1) for signbit extraction
+ // If we are shifting down an extended sign bit, see if we can simplify
+ // this to shifting the MSB directly to expose further simplifications.
+ // This pattern often appears after sext_inreg legalization.
+ APInt Amt;
+ if (sd_match(N, m_And(m_Srl(m_Value(X), m_ConstInt(Amt)), m_One())) &&
+ Amt.ult(BitWidth - 1) && Amt.uge(BitWidth - DAG.ComputeNumSignBits(X)))
+ return DAG.getNode(ISD::SRL, DL, VT, X,
+ DAG.getShiftAmountConstant(BitWidth - 1, VT, DL));
+
// Masking the negated extension of a boolean is just the zero-extended
// boolean:
// and (sub 0, zext(bool X)), 1 --> zext(bool X)
@@ -7399,16 +7409,13 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
//
// Note: the SimplifyDemandedBits fold below can make an information-losing
// transform, and then we have no way to find this better fold.
- if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
- if (isNullOrNullSplat(N0.getOperand(0))) {
- SDValue SubRHS = N0.getOperand(1);
- if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
- SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
- return SubRHS;
- if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
- SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
- return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SubRHS.getOperand(0));
- }
+ if (sd_match(N, m_And(m_Sub(m_Zero(), m_Value(X)), m_One()))) {
+ if (X.getOpcode() == ISD::ZERO_EXTEND &&
+ X.getOperand(0).getScalarValueSizeInBits() == 1)
+ return X;
+ if (X.getOpcode() == ISD::SIGN_EXTEND &&
+ X.getOperand(0).getScalarValueSizeInBits() == 1)
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, X.getOperand(0));
}
// fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a16ec19..0360c1b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -797,6 +797,16 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
return Op.getOperand(1);
break;
}
+ case ISD::ADD: {
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (RHSKnown.isZero())
+ return Op.getOperand(0);
+
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (LHSKnown.isZero())
+ return Op.getOperand(1);
+ break;
+ }
case ISD::SHL: {
// If we are only demanding sign bits then we can use the shift source
// directly.
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 3130a0b..459b4d2 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -32,6 +32,7 @@
#include "llvm/IR/Type.h"
using namespace llvm;
+using namespace Intrinsic;
/// Table of string intrinsic names indexed by enum value.
static constexpr const char *const IntrinsicNameTable[] = {
@@ -48,7 +49,7 @@ StringRef Intrinsic::getBaseName(ID id) {
StringRef Intrinsic::getName(ID id) {
assert(id < num_intrinsics && "Invalid intrinsic ID!");
- assert(!Intrinsic::isOverloaded(id) &&
+ assert(!isOverloaded(id) &&
"This version of getName does not support overloading");
return getBaseName(id);
}
@@ -151,27 +152,27 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
return Result;
}
-static std::string getIntrinsicNameImpl(Intrinsic::ID Id, ArrayRef<Type *> Tys,
- Module *M, FunctionType *FT,
+static std::string getIntrinsicNameImpl(ID Id, ArrayRef<Type *> Tys, Module *M,
+ FunctionType *FT,
bool EarlyModuleCheck) {
- assert(Id < Intrinsic::num_intrinsics && "Invalid intrinsic ID!");
- assert((Tys.empty() || Intrinsic::isOverloaded(Id)) &&
+ assert(Id < num_intrinsics && "Invalid intrinsic ID!");
+ assert((Tys.empty() || isOverloaded(Id)) &&
"This version of getName is for overloaded intrinsics only");
(void)EarlyModuleCheck;
assert((!EarlyModuleCheck || M ||
!any_of(Tys, [](Type *T) { return isa<PointerType>(T); })) &&
"Intrinsic overloading on pointer types need to provide a Module");
bool HasUnnamedType = false;
- std::string Result(Intrinsic::getBaseName(Id));
+ std::string Result(getBaseName(Id));
for (Type *Ty : Tys)
Result += "." + getMangledTypeStr(Ty, HasUnnamedType);
if (HasUnnamedType) {
assert(M && "unnamed types need a module");
if (!FT)
- FT = Intrinsic::getType(M->getContext(), Id, Tys);
+ FT = getType(M->getContext(), Id, Tys);
else
- assert((FT == Intrinsic::getType(M->getContext(), Id, Tys)) &&
+ assert((FT == getType(M->getContext(), Id, Tys)) &&
"Provided FunctionType must match arguments");
return M->getUniqueIntrinsicName(Result, Id, FT);
}
@@ -198,13 +199,10 @@ enum IIT_Info {
#undef GET_INTRINSIC_IITINFO
};
-static void
-DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
- IIT_Info LastInfo,
- SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
- using namespace Intrinsic;
-
- bool IsScalableVector = (LastInfo == IIT_SCALABLE_VEC);
+static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
+ IIT_Info LastInfo,
+ SmallVectorImpl<IITDescriptor> &OutputTable) {
+ bool IsScalableVector = LastInfo == IIT_SCALABLE_VEC;
IIT_Info Info = IIT_Info(Infos[NextElt++]);
unsigned StructElts = 2;
@@ -481,10 +479,8 @@ void Intrinsic::getIntrinsicInfoTableEntries(
DecodeIITType(NextElt, IITEntries, IIT_Done, T);
}
-static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
+static Type *DecodeFixedType(ArrayRef<IITDescriptor> &Infos,
ArrayRef<Type *> Tys, LLVMContext &Context) {
- using namespace Intrinsic;
-
IITDescriptor D = Infos.front();
Infos = Infos.slice(1);
@@ -617,13 +613,10 @@ bool Intrinsic::isOverloaded(ID id) {
#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_TARGET_DATA
-bool Intrinsic::isTargetIntrinsic(Intrinsic::ID IID) {
- return IID > TargetInfos[0].Count;
-}
+bool Intrinsic::isTargetIntrinsic(ID IID) { return IID > TargetInfos[0].Count; }
-int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
- StringRef Name,
- StringRef Target) {
+int Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
+ StringRef Name, StringRef Target) {
assert(Name.starts_with("llvm.") && "Unexpected intrinsic prefix");
assert(Name.drop_front(5).starts_with(Target) && "Unexpected target");
@@ -685,24 +678,23 @@ findTargetSubtable(StringRef Name) {
/// This does the actual lookup of an intrinsic ID which matches the given
/// function name.
-Intrinsic::ID Intrinsic::lookupIntrinsicID(StringRef Name) {
+ID Intrinsic::lookupIntrinsicID(StringRef Name) {
auto [NameTable, Target] = findTargetSubtable(Name);
- int Idx = Intrinsic::lookupLLVMIntrinsicByName(NameTable, Name, Target);
+ int Idx = lookupLLVMIntrinsicByName(NameTable, Name, Target);
if (Idx == -1)
- return Intrinsic::not_intrinsic;
+ return not_intrinsic;
// Intrinsic IDs correspond to the location in IntrinsicNameTable, but we have
// an index into a sub-table.
int Adjust = NameTable.data() - IntrinsicNameTable;
- Intrinsic::ID ID = static_cast<Intrinsic::ID>(Idx + Adjust);
+ ID Id = static_cast<ID>(Idx + Adjust);
// If the intrinsic is not overloaded, require an exact match. If it is
// overloaded, require either exact or prefix match.
const auto MatchSize = strlen(NameTable[Idx]);
assert(Name.size() >= MatchSize && "Expected either exact or prefix match");
bool IsExactMatch = Name.size() == MatchSize;
- return IsExactMatch || Intrinsic::isOverloaded(ID) ? ID
- : Intrinsic::not_intrinsic;
+ return IsExactMatch || isOverloaded(Id) ? Id : not_intrinsic;
}
/// This defines the "Intrinsic::getAttributes(ID id)" method.
@@ -743,8 +735,7 @@ Function *Intrinsic::getDeclarationIfExists(Module *M, ID id,
bool Intrinsic::isConstrainedFPIntrinsic(ID QID) {
switch (QID) {
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
- case Intrinsic::INTRINSIC:
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) case INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
#undef INSTRUCTION
return true;
@@ -753,10 +744,10 @@ bool Intrinsic::isConstrainedFPIntrinsic(ID QID) {
}
}
-bool Intrinsic::hasConstrainedFPRoundingModeOperand(Intrinsic::ID QID) {
+bool Intrinsic::hasConstrainedFPRoundingModeOperand(ID QID) {
switch (QID) {
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
- case Intrinsic::INTRINSIC: \
+ case INTRINSIC: \
return ROUND_MODE == 1;
#include "llvm/IR/ConstrainedOps.def"
#undef INSTRUCTION
@@ -765,16 +756,13 @@ bool Intrinsic::hasConstrainedFPRoundingModeOperand(Intrinsic::ID QID) {
}
}
-using DeferredIntrinsicMatchPair =
- std::pair<Type *, ArrayRef<Intrinsic::IITDescriptor>>;
+using DeferredIntrinsicMatchPair = std::pair<Type *, ArrayRef<IITDescriptor>>;
static bool
-matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
+matchIntrinsicType(Type *Ty, ArrayRef<IITDescriptor> &Infos,
SmallVectorImpl<Type *> &ArgTys,
SmallVectorImpl<DeferredIntrinsicMatchPair> &DeferredChecks,
bool IsDeferredCheck) {
- using namespace Intrinsic;
-
// If we ran out of descriptors, there are too many arguments.
if (Infos.empty())
return true;
@@ -993,9 +981,9 @@ matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
llvm_unreachable("unhandled");
}
-Intrinsic::MatchIntrinsicTypesResult
+MatchIntrinsicTypesResult
Intrinsic::matchIntrinsicSignature(FunctionType *FTy,
- ArrayRef<Intrinsic::IITDescriptor> &Infos,
+ ArrayRef<IITDescriptor> &Infos,
SmallVectorImpl<Type *> &ArgTys) {
SmallVector<DeferredIntrinsicMatchPair, 2> DeferredChecks;
if (matchIntrinsicType(FTy->getReturnType(), Infos, ArgTys, DeferredChecks,
@@ -1019,8 +1007,8 @@ Intrinsic::matchIntrinsicSignature(FunctionType *FTy,
return MatchIntrinsicTypes_Match;
}
-bool Intrinsic::matchIntrinsicVarArg(
- bool isVarArg, ArrayRef<Intrinsic::IITDescriptor> &Infos) {
+bool Intrinsic::matchIntrinsicVarArg(bool isVarArg,
+ ArrayRef<IITDescriptor> &Infos) {
// If there are no descriptors left, then it can't be a vararg.
if (Infos.empty())
return isVarArg;
@@ -1038,20 +1026,20 @@ bool Intrinsic::matchIntrinsicVarArg(
return true;
}
-bool Intrinsic::getIntrinsicSignature(Intrinsic::ID ID, FunctionType *FT,
+bool Intrinsic::getIntrinsicSignature(ID ID, FunctionType *FT,
SmallVectorImpl<Type *> &ArgTys) {
if (!ID)
return false;
- SmallVector<Intrinsic::IITDescriptor, 8> Table;
+ SmallVector<IITDescriptor, 8> Table;
getIntrinsicInfoTableEntries(ID, Table);
- ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+ ArrayRef<IITDescriptor> TableRef = Table;
- if (Intrinsic::matchIntrinsicSignature(FT, TableRef, ArgTys) !=
- Intrinsic::MatchIntrinsicTypesResult::MatchIntrinsicTypes_Match) {
+ if (matchIntrinsicSignature(FT, TableRef, ArgTys) !=
+ MatchIntrinsicTypesResult::MatchIntrinsicTypes_Match) {
return false;
}
- if (Intrinsic::matchIntrinsicVarArg(FT->isVarArg(), TableRef))
+ if (matchIntrinsicVarArg(FT->isVarArg(), TableRef))
return false;
return true;
}
@@ -1067,10 +1055,10 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
if (!getIntrinsicSignature(F, ArgTys))
return std::nullopt;
- Intrinsic::ID ID = F->getIntrinsicID();
+ ID ID = F->getIntrinsicID();
StringRef Name = F->getName();
std::string WantedName =
- Intrinsic::getName(ID, ArgTys, F->getParent(), F->getFunctionType());
+ getName(ID, ArgTys, F->getParent(), F->getFunctionType());
if (Name == WantedName)
return std::nullopt;
@@ -1086,7 +1074,7 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
// invalid and we'll get an error.
ExistingGV->setName(WantedName + ".renamed");
}
- return Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ArgTys);
+ return getOrInsertDeclaration(F->getParent(), ID, ArgTys);
}();
NewDecl->setCallingConv(F->getCallingConv());
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index e311cde..88ede0d 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/Type.h"
#include "LLVMContextImpl.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -436,20 +437,37 @@ bool StructType::containsHomogeneousTypes() const {
}
void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) {
+ cantFail(setBodyOrError(Elements, isPacked));
+}
+
+Error StructType::setBodyOrError(ArrayRef<Type *> Elements, bool isPacked) {
assert(isOpaque() && "Struct body already set!");
+ if (auto E = checkBody(Elements))
+ return E;
+
setSubclassData(getSubclassData() | SCDB_HasBody);
if (isPacked)
setSubclassData(getSubclassData() | SCDB_Packed);
NumContainedTys = Elements.size();
+ ContainedTys = Elements.empty()
+ ? nullptr
+ : Elements.copy(getContext().pImpl->Alloc).data();
- if (Elements.empty()) {
- ContainedTys = nullptr;
- return;
- }
+ return Error::success();
+}
- ContainedTys = Elements.copy(getContext().pImpl->Alloc).data();
+Error StructType::checkBody(ArrayRef<Type *> Elements) {
+ SmallSetVector<Type *, 4> Worklist(Elements.begin(), Elements.end());
+ for (unsigned I = 0; I < Worklist.size(); ++I) {
+ Type *Ty = Worklist[I];
+ if (Ty == this)
+ return createStringError(Twine("identified structure type '") +
+ getName() + "' is recursive");
+ Worklist.insert(Ty->subtype_begin(), Ty->subtype_end());
+ }
+ return Error::success();
}
void StructType::setName(StringRef Name) {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 5067fbf..0d54c53 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -34,6 +34,12 @@
#include <utility>
using namespace llvm;
+/// Most of the errors produced by this module are inconvertible StringErrors.
+/// This convenience function lets us return one of those more easily.
+static Error stringErr(const Twine &T) {
+ return make_error<StringError>(T, inconvertibleErrorCode());
+}
+
//===----------------------------------------------------------------------===//
// TypeMap implementation.
//===----------------------------------------------------------------------===//
@@ -69,7 +75,7 @@ public:
/// Produce a body for an opaque type in the dest module from a type
/// definition in the source module.
- void linkDefinedTypeBodies();
+ Error linkDefinedTypeBodies();
/// Return the mapped type to use for the specified input type from the
/// source module.
@@ -207,7 +213,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
return true;
}
-void TypeMapTy::linkDefinedTypeBodies() {
+Error TypeMapTy::linkDefinedTypeBodies() {
SmallVector<Type *, 16> Elements;
for (StructType *SrcSTy : SrcDefinitionsToResolve) {
StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]);
@@ -218,11 +224,13 @@ void TypeMapTy::linkDefinedTypeBodies() {
for (unsigned I = 0, E = Elements.size(); I != E; ++I)
Elements[I] = get(SrcSTy->getElementType(I));
- DstSTy->setBody(Elements, SrcSTy->isPacked());
+ if (auto E = DstSTy->setBodyOrError(Elements, SrcSTy->isPacked()))
+ return E;
DstStructTypesSet.switchToNonOpaque(DstSTy);
}
SrcDefinitionsToResolve.clear();
DstResolvedOpaqueTypes.clear();
+ return Error::success();
}
void TypeMapTy::finishType(StructType *DTy, StructType *STy,
@@ -439,12 +447,6 @@ class IRLinker {
FoundError = std::move(E);
}
- /// Most of the errors produced by this module are inconvertible StringErrors.
- /// This convenience function lets us return one of those more easily.
- Error stringErr(const Twine &T) {
- return make_error<StringError>(T, inconvertibleErrorCode());
- }
-
/// Entry point for mapping values and alternate context for mapping aliases.
ValueMapper Mapper;
unsigned IndirectSymbolMCID;
@@ -875,7 +877,7 @@ void IRLinker::computeTypeMapping() {
// Now that we have discovered all of the type equivalences, get a body for
// any 'opaque' types in the dest module that are now resolved.
- TypeMap.linkDefinedTypeBodies();
+ setError(TypeMap.linkDefinedTypeBodies());
}
static void getArrayElements(const Constant *C,
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 89668af..1622959 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -610,28 +610,82 @@ static KnownBits computeForSatAddSub(bool Add, bool Signed,
const KnownBits &RHS) {
// We don't see NSW even for sadd/ssub as we want to check if the result has
// signed overflow.
- KnownBits Res =
- KnownBits::computeForAddSub(Add, /*NSW=*/false, /*NUW=*/false, LHS, RHS);
- unsigned BitWidth = Res.getBitWidth();
- auto SignBitKnown = [&](const KnownBits &K) {
- return K.Zero[BitWidth - 1] || K.One[BitWidth - 1];
- };
- std::optional<bool> Overflow;
+ unsigned BitWidth = LHS.getBitWidth();
+ std::optional<bool> Overflow;
+ // Even if we can't entirely rule out overflow, we may be able to rule out
+ // overflow in one direction. This allows us to potentially keep some of the
+ // add/sub bits. I.e. if we can't overflow in the positive direction we won't
+ // clamp to INT_MAX so we can keep low 0s from the add/sub result.
+ bool MayNegClamp = true;
+ bool MayPosClamp = true;
if (Signed) {
- // If we can actually detect overflow do so. Otherwise leave Overflow as
- // nullopt (we assume it may have happened).
- if (SignBitKnown(LHS) && SignBitKnown(RHS) && SignBitKnown(Res)) {
+ // Easy cases we can rule out any overflow.
+ if (Add && ((LHS.isNegative() && RHS.isNonNegative()) ||
+ (LHS.isNonNegative() && RHS.isNegative())))
+ Overflow = false;
+ else if (!Add && (((LHS.isNegative() && RHS.isNegative()) ||
+ (LHS.isNonNegative() && RHS.isNonNegative()))))
+ Overflow = false;
+ else {
+ // Check if we may overflow. If we can't rule out overflow then check if
+ // we can rule out a direction at least.
+ KnownBits UnsignedLHS = LHS;
+ KnownBits UnsignedRHS = RHS;
+ // Get version of LHS/RHS with clearer signbit. This allows us to detect
+ // how the addition/subtraction might overflow into the signbit. Then
+ // using the actual known signbits of LHS/RHS, we can figure out which
+ // overflows are/aren't possible.
+ UnsignedLHS.One.clearSignBit();
+ UnsignedLHS.Zero.setSignBit();
+ UnsignedRHS.One.clearSignBit();
+ UnsignedRHS.Zero.setSignBit();
+ KnownBits Res =
+ KnownBits::computeForAddSub(Add, /*NSW=*/false,
+ /*NUW=*/false, UnsignedLHS, UnsignedRHS);
if (Add) {
- // sadd.sat
- Overflow = (LHS.isNonNegative() == RHS.isNonNegative() &&
- Res.isNonNegative() != LHS.isNonNegative());
+ if (Res.isNegative()) {
+ // Only overflow scenario is Pos + Pos.
+ MayNegClamp = false;
+ // Pos + Pos will overflow with extra signbit.
+ if (LHS.isNonNegative() && RHS.isNonNegative())
+ Overflow = true;
+ } else if (Res.isNonNegative()) {
+ // Only overflow scenario is Neg + Neg
+ MayPosClamp = false;
+ // Neg + Neg will overflow without extra signbit.
+ if (LHS.isNegative() && RHS.isNegative())
+ Overflow = true;
+ }
+ // We will never clamp to the opposite sign of N-bit result.
+ if (LHS.isNegative() || RHS.isNegative())
+ MayPosClamp = false;
+ if (LHS.isNonNegative() || RHS.isNonNegative())
+ MayNegClamp = false;
} else {
- // ssub.sat
- Overflow = (LHS.isNonNegative() != RHS.isNonNegative() &&
- Res.isNonNegative() != LHS.isNonNegative());
+ if (Res.isNegative()) {
+ // Only overflow scenario is Neg - Pos.
+ MayPosClamp = false;
+ // Neg - Pos will overflow with extra signbit.
+ if (LHS.isNegative() && RHS.isNonNegative())
+ Overflow = true;
+ } else if (Res.isNonNegative()) {
+ // Only overflow scenario is Pos - Neg.
+ MayNegClamp = false;
+ // Pos - Neg will overflow without extra signbit.
+ if (LHS.isNonNegative() && RHS.isNegative())
+ Overflow = true;
+ }
+ // We will never clamp to the opposite sign of N-bit result.
+ if (LHS.isNegative() || RHS.isNonNegative())
+ MayPosClamp = false;
+ if (LHS.isNonNegative() || RHS.isNegative())
+ MayNegClamp = false;
}
}
+ // If we have ruled out all clamping, we will never overflow.
+ if (!MayNegClamp && !MayPosClamp)
+ Overflow = false;
} else if (Add) {
// uadd.sat
bool Of;
@@ -656,52 +710,8 @@ static KnownBits computeForSatAddSub(bool Add, bool Signed,
}
}
- if (Signed) {
- if (Add) {
- if (LHS.isNonNegative() && RHS.isNonNegative()) {
- // Pos + Pos -> Pos
- Res.One.clearSignBit();
- Res.Zero.setSignBit();
- }
- if (LHS.isNegative() && RHS.isNegative()) {
- // Neg + Neg -> Neg
- Res.One.setSignBit();
- Res.Zero.clearSignBit();
- }
- } else {
- if (LHS.isNegative() && RHS.isNonNegative()) {
- // Neg - Pos -> Neg
- Res.One.setSignBit();
- Res.Zero.clearSignBit();
- } else if (LHS.isNonNegative() && RHS.isNegative()) {
- // Pos - Neg -> Pos
- Res.One.clearSignBit();
- Res.Zero.setSignBit();
- }
- }
- } else {
- // Add: Leading ones of either operand are preserved.
- // Sub: Leading zeros of LHS and leading ones of RHS are preserved
- // as leading zeros in the result.
- unsigned LeadingKnown;
- if (Add)
- LeadingKnown =
- std::max(LHS.countMinLeadingOnes(), RHS.countMinLeadingOnes());
- else
- LeadingKnown =
- std::max(LHS.countMinLeadingZeros(), RHS.countMinLeadingOnes());
-
- // We select between the operation result and all-ones/zero
- // respectively, so we can preserve known ones/zeros.
- APInt Mask = APInt::getHighBitsSet(BitWidth, LeadingKnown);
- if (Add) {
- Res.One |= Mask;
- Res.Zero &= ~Mask;
- } else {
- Res.Zero |= Mask;
- Res.One &= ~Mask;
- }
- }
+ KnownBits Res = KnownBits::computeForAddSub(Add, /*NSW=*/Signed,
+ /*NUW=*/!Signed, LHS, RHS);
if (Overflow) {
// We know whether or not we overflowed.
@@ -714,7 +724,7 @@ static KnownBits computeForSatAddSub(bool Add, bool Signed,
APInt C;
if (Signed) {
// sadd.sat / ssub.sat
- assert(SignBitKnown(LHS) &&
+ assert(!LHS.isSignUnknown() &&
"We somehow know overflow without knowing input sign");
C = LHS.isNegative() ? APInt::getSignedMinValue(BitWidth)
: APInt::getSignedMaxValue(BitWidth);
@@ -735,8 +745,10 @@ static KnownBits computeForSatAddSub(bool Add, bool Signed,
if (Signed) {
// sadd.sat/ssub.sat
// We can keep our information about the sign bits.
- Res.Zero.clearLowBits(BitWidth - 1);
- Res.One.clearLowBits(BitWidth - 1);
+ if (MayPosClamp)
+ Res.Zero.clearLowBits(BitWidth - 1);
+ if (MayNegClamp)
+ Res.One.clearLowBits(BitWidth - 1);
} else if (Add) {
// uadd.sat
// We need to clear all the known zeros as we can only use the leading ones.
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 1679534..8c256b5 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1429,8 +1429,6 @@ class sve2_int_perm_revd<string asm>
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = DestructiveUnary;
- let ElementSize = ZPR128.ElementSize;
}
multiclass sve2_int_perm_revd<string asm, SDPatternOperator op> {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 86d8dbe..786baa6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1740,6 +1740,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->setFlag(Info->VReg, Info->Flags);
}
+ for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
+ Register ParsedReg;
+ if (parseRegister(YamlRegStr, ParsedReg))
+ return true;
+ MFI->SpillPhysVGPRs.push_back(ParsedReg);
+ }
+
auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
const TargetRegisterClass &RC,
ArgDescriptor &Arg, unsigned UserSGPRs,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e59dd72..1e43d27 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -711,6 +711,9 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()),
PSInputEnable(MFI.getPSInputEnable()),
Mode(MFI.getMode()) {
+ for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
+ SpillPhysVGPRS.push_back(regToString(Reg, TRI));
+
for (Register Reg : MFI.getWWMReservedRegs())
WWMReservedRegs.push_back(regToString(Reg, TRI));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index c8c305e..018322e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -275,6 +275,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
// TODO: 10 may be a better default since it's the maximum.
unsigned Occupancy = 0;
+ SmallVector<StringValue, 2> SpillPhysVGPRS;
SmallVector<StringValue> WWMReservedRegs;
StringValue ScratchRSrcReg = "$private_rsrc_reg";
@@ -336,6 +337,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
YamlIO.mapOptional("occupancy", MFI.Occupancy, 0);
+ YamlIO.mapOptional("spillPhysVGPRs", MFI.SpillPhysVGPRS);
YamlIO.mapOptional("wwmReservedRegs", MFI.WWMReservedRegs);
YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy,
@@ -610,6 +612,7 @@ public:
}
ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
+ ArrayRef<Register> getSGPRSpillPhysVGPRs() const { return SpillPhysVGPRs; }
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8b8884a..7074688 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2628,8 +2628,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
std::swap(FIOperandNum, OtherOpIdx);
}
- for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
- // Depending on operand constraints we may need to insert another copy.
+ // We need at most one mov to satisfy the operand constraints. Prefer to
+ // move the FI operand first, as it may be a literal in a VOP3
+ // instruction.
+ for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
if (!TII->isOperandLegal(*MI, SrcIdx)) {
// If commuting didn't make the operands legal, we need to materialize
// in a register.
@@ -2648,6 +2650,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Src.ChangeToRegister(ScavengedVGPR, false);
Src.setIsKill(true);
+ break;
}
}
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 2b18b299..8033898 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -365,6 +365,8 @@ class TargetRegisterClass;
return ABI.IsN64() ? Mips::A1_64 : Mips::A1;
}
+ bool softPromoteHalfType() const override { return true; }
+
bool isJumpTableRelative() const override {
return getTargetMachine().isPositionIndependent();
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d3bf0ec..18b05b2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5694,10 +5694,6 @@ static SDValue PerformANDCombine(SDNode *N,
Val = Val->getOperand(0);
}
- if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
- Val = Val->getOperand(0);
- }
-
if (Val->getOpcode() == NVPTXISD::LoadV2 ||
Val->getOpcode() == NVPTXISD::LoadV4) {
ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d2d03d4..43f08b4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1333,7 +1333,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// expansion to a build_vector of 0s.
setOperationAction(ISD::UNDEF, VT, Custom);
- setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
+ setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
+ ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_REVERSE,
ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS},
VT, Custom);
@@ -1404,10 +1405,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
continue;
}
- setOperationAction({ISD::BUILD_VECTOR,
- ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
- ISD::SCALAR_TO_VECTOR},
- VT, Custom);
+ setOperationAction({ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR}, VT,
+ Custom);
setOperationAction(
{ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom);
@@ -1449,6 +1448,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
Custom);
if (Subtarget.hasStdExtZfhminOrZhinxmin())
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ if (Subtarget.hasStdExtZfbfmin())
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
if (Subtarget.hasStdExtFOrZfinx())
setOperationAction(ISD::BITCAST, MVT::f32, Custom);
if (Subtarget.hasStdExtDOrZdinx())
@@ -2208,20 +2209,12 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
return true;
// Conservatively only handle extracting half of a vector.
+ // TODO: We can do arbitrary slidedowns, but for now only support extracting
+ // the upper half of a vector until we have more test coverage.
// TODO: For sizes which aren't multiples of VLEN sizes, this may not be
// a cheap extract. However, this case is important in practice for
// shuffled extracts of longer vectors. How to resolve?
- if ((ResElts * 2) != SrcElts)
- return false;
-
- // Slide can support arbitrary index, but we only treat vslidedown.vi as
- // cheap.
- if (Index >= 32)
- return false;
-
- // TODO: We can do arbitrary slidedowns, but for now only support extracting
- // the upper half of a vector until we have more test coverage.
- return Index == 0 || Index == ResElts;
+ return (ResElts * 2) == SrcElts && (Index == 0 || Index == ResElts);
}
MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
@@ -4817,6 +4810,24 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlide1(const SDLoc &DL, MVT VT,
MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
auto [TrueMask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+ // zvfhmin and zvfbfmin don't have vfslide1{down,up}.vf so use fmv.x.h +
+ // vslide1{down,up}.vx instead.
+ if (VT.getVectorElementType() == MVT::bf16 ||
+ (VT.getVectorElementType() == MVT::f16 &&
+ !Subtarget.hasVInstructionsF16())) {
+ MVT IntVT = ContainerVT.changeVectorElementTypeToInteger();
+ Splat =
+ DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, Subtarget.getXLenVT(), Splat);
+ V2 = DAG.getBitcast(
+ IntVT, convertToScalableVector(ContainerVT, V2, DAG, Subtarget));
+ SDValue Vec = DAG.getNode(
+ IsVSlidedown ? RISCVISD::VSLIDE1DOWN_VL : RISCVISD::VSLIDE1UP_VL, DL,
+ IntVT, DAG.getUNDEF(IntVT), V2, Splat, TrueMask, VL);
+ Vec = DAG.getBitcast(ContainerVT, Vec);
+ return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+ }
+
auto OpCode = IsVSlidedown ?
(VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL) :
(VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL);
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 790d86f..d529904 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -175,10 +175,13 @@ static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB,
MachineRegisterInfo *MRI = MIB.getMRI();
if (!MRI->getRegClassOrNull(ResVReg))
MRI->setRegClass(ResVReg, GR->getRegClass(ResType));
- MIB.buildInstr(SPIRV::OpBitcast)
- .addDef(ResVReg)
- .addUse(GR->getSPIRVTypeID(ResType))
- .addUse(OpReg);
+ if (ResType == OpType)
+ MIB.buildInstr(TargetOpcode::COPY).addDef(ResVReg).addUse(OpReg);
+ else
+ MIB.buildInstr(SPIRV::OpBitcast)
+ .addDef(ResVReg)
+ .addUse(GR->getSPIRVTypeID(ResType))
+ .addUse(OpReg);
}
// We do instruction selections early instead of calling MIB.buildBitcast()
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 34854f3..194ce7c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -55,14 +55,14 @@ static std::string computeDataLayout(const Triple &TT) {
// memory model used for graphics: PhysicalStorageBuffer64. But it shouldn't
// mean anything.
if (Arch == Triple::spirv32)
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1";
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-"
+ "v256:256-v512:512-v1024:1024-n8:16:32:64-G1";
if (TT.getVendor() == Triple::VendorType::AMD &&
TT.getOS() == Triple::OSType::AMDHSA)
- return "e-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0";
- return "e-i64:64-v16:16-v24:32-v32:32-v48:64-"
- "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1";
+ return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+ "v512:512-v1024:1024-n32:64-S32-G1-P4-A0";
+ return "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+ "v512:512-v1024:1024-n8:16:32:64-G1";
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
index 1cda7f9..a0a2682 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -78,7 +78,7 @@ bool Lowerer::lower(Function &F) {
case Intrinsic::coro_end:
case Intrinsic::coro_suspend_retcon:
if (IsPrivateAndUnprocessed) {
- II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->replaceAllUsesWith(PoisonValue::get(II->getType()));
} else
continue;
break;
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 070df42..efd6221 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -375,7 +375,7 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End,
if (auto *RetStructTy = dyn_cast<StructType>(RetTy)) {
assert(RetStructTy->getNumElements() == NumReturns &&
"numbers of returns should match resume function singature");
- Value *ReturnValue = UndefValue::get(RetStructTy);
+ Value *ReturnValue = PoisonValue::get(RetStructTy);
unsigned Idx = 0;
for (Value *RetValEl : CoroResults->return_values())
ReturnValue = Builder.CreateInsertValue(ReturnValue, RetValEl, Idx++);
@@ -406,7 +406,7 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End,
Value *ReturnValue = ConstantPointerNull::get(ContinuationTy);
if (RetStructTy) {
- ReturnValue = Builder.CreateInsertValue(UndefValue::get(RetStructTy),
+ ReturnValue = Builder.CreateInsertValue(PoisonValue::get(RetStructTy),
ReturnValue, 0);
}
Builder.CreateRet(ReturnValue);
@@ -1304,7 +1304,7 @@ static void handleNoSuspendCoroutine(coro::Shape &Shape) {
case coro::ABI::Async:
case coro::ABI::Retcon:
case coro::ABI::RetconOnce:
- CoroBegin->replaceAllUsesWith(UndefValue::get(CoroBegin->getType()));
+ CoroBegin->replaceAllUsesWith(PoisonValue::get(CoroBegin->getType()));
break;
}
@@ -1758,7 +1758,7 @@ static void replaceAsyncResumeFunction(CoroSuspendAsyncInst *Suspend,
ResumeIntrinsic->replaceAllUsesWith(Val);
ResumeIntrinsic->eraseFromParent();
Suspend->setOperand(CoroSuspendAsyncInst::ResumeFunctionArg,
- UndefValue::get(Int8PtrTy));
+ PoisonValue::get(Int8PtrTy));
}
/// Coerce the arguments in \p FnArgs according to \p FnTy in \p CallArgs.
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 45b9767..9032cad 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -353,18 +353,18 @@ void coro::Shape::invalidateCoroutine(
assert(!CoroBegin);
{
// Replace coro.frame which are supposed to be lowered to the result of
- // coro.begin with undef.
- auto *Undef = UndefValue::get(PointerType::get(F.getContext(), 0));
+ // coro.begin with poison.
+ auto *Poison = PoisonValue::get(PointerType::get(F.getContext(), 0));
for (CoroFrameInst *CF : CoroFrames) {
- CF->replaceAllUsesWith(Undef);
+ CF->replaceAllUsesWith(Poison);
CF->eraseFromParent();
}
CoroFrames.clear();
- // Replace all coro.suspend with undef and remove related coro.saves if
+ // Replace all coro.suspend with poison and remove related coro.saves if
// present.
for (AnyCoroSuspendInst *CS : CoroSuspends) {
- CS->replaceAllUsesWith(UndefValue::get(CS->getType()));
+ CS->replaceAllUsesWith(PoisonValue::get(CS->getType()));
CS->eraseFromParent();
if (auto *CoroSave = CS->getCoroSave())
CoroSave->eraseFromParent();
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index ac3f2ba..d540e6c 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -336,9 +336,9 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM,
}
// There potentially are metadata uses for things like llvm.dbg.value.
- // Replace them with undef, after handling the other regular uses.
- auto RauwUndefMetadata = make_scope_exit(
- [&]() { Arg.replaceAllUsesWith(UndefValue::get(Arg.getType())); });
+ // Replace them with poison, after handling the other regular uses.
+ auto RauwPoisonMetadata = make_scope_exit(
+ [&]() { Arg.replaceAllUsesWith(PoisonValue::get(Arg.getType())); });
if (Arg.use_empty())
continue;
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index f2856ae..7f83834 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -831,9 +831,7 @@ static bool expandUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR,
// Even if we don't know X's range, the divisor may be so large, X can't ever
// be 2x larger than that. I.e. if divisor is always negative.
- if (!XCR.icmp(ICmpInst::ICMP_ULT,
- YCR.umul_sat(APInt(YCR.getBitWidth(), 2))) &&
- !YCR.isAllNegative())
+ if (!XCR.icmp(ICmpInst::ICMP_ULT, YCR.uadd_sat(YCR)) && !YCR.isAllNegative())
return false;
IRBuilder<> B(Instr);
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index a1dbb4e..cd4846e 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -964,33 +964,26 @@ private:
bool overridingStores(const ParseMemoryInst &Earlier,
const ParseMemoryInst &Later);
- Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
- // TODO: We could insert relevant casts on type mismatch here.
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI->getType() == ExpectedType ? LI : nullptr;
- if (auto *SI = dyn_cast<StoreInst>(Inst)) {
- Value *V = SI->getValueOperand();
- return V->getType() == ExpectedType ? V : nullptr;
+ Value *getOrCreateResult(Instruction *Inst, Type *ExpectedType) const {
+ // TODO: We could insert relevant casts on type mismatch.
+ // The load or the store's first operand.
+ Value *V;
+ if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_load:
+ V = II;
+ break;
+ case Intrinsic::masked_store:
+ V = II->getOperand(0);
+ break;
+ default:
+ return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
+ }
+ } else {
+ V = isa<LoadInst>(Inst) ? Inst : cast<StoreInst>(Inst)->getValueOperand();
}
- assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
- auto *II = cast<IntrinsicInst>(Inst);
- if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
- return getOrCreateResultNonTargetMemIntrinsic(II, ExpectedType);
- return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
- }
- Value *getOrCreateResultNonTargetMemIntrinsic(IntrinsicInst *II,
- Type *ExpectedType) const {
- // TODO: We could insert relevant casts on type mismatch here.
- switch (II->getIntrinsicID()) {
- case Intrinsic::masked_load:
- return II->getType() == ExpectedType ? II : nullptr;
- case Intrinsic::masked_store: {
- Value *V = II->getOperand(0);
- return V->getType() == ExpectedType ? V : nullptr;
- }
- }
- return nullptr;
+ return V->getType() == ExpectedType ? V : nullptr;
}
/// Return true if the instruction is known to only operate on memory
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 0f04ff9..d51d043 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2663,15 +2663,15 @@ LSRInstance::OptimizeLoopTermCond() {
// Conservatively avoid trying to use the post-inc value in non-latch
// exits if there may be pre-inc users in intervening blocks.
if (LatchBlock != ExitingBlock)
- for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+ for (const IVStrideUse &UI : IU)
// Test if the use is reachable from the exiting block. This dominator
// query is a conservative approximation of reachability.
- if (&*UI != CondUse &&
- !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+ if (&UI != CondUse &&
+ !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
// Conservatively assume there may be reuse if the quotient of their
// strides could be a legal scale.
const SCEV *A = IU.getStride(*CondUse, L);
- const SCEV *B = IU.getStride(*UI, L);
+ const SCEV *B = IU.getStride(UI, L);
if (!A || !B) continue;
if (SE.getTypeSizeInBits(A->getType()) !=
SE.getTypeSizeInBits(B->getType())) {
@@ -2692,9 +2692,9 @@ LSRInstance::OptimizeLoopTermCond() {
C->getValue().isMinSignedValue())
goto decline_post_inc;
// Check for possible scaled-address reuse.
- if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
- MemAccessTy AccessTy = getAccessType(
- TTI, UI->getUser(), UI->getOperandValToReplace());
+ if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
+ MemAccessTy AccessTy =
+ getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
int64_t Scale = C->getSExtValue();
if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
/*BaseOffset=*/0,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1c64bd2..35113c20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -470,18 +470,14 @@ public:
ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+ AC(AC), ORE(ORE), VF(VecWidth),
+ MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
-
- if (MinProfitableTripCount.isZero())
- this->MinProfitableTripCount = VecWidth;
- else
- this->MinProfitableTripCount = MinProfitableTripCount;
}
virtual ~InnerLoopVectorizer() = default;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3c6daf7..4454eb3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7046,7 +7046,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
OrdersType Order;
SmallVector<Value *> PointerOps;
// Segmented load detected - vectorize at maximum vector factor.
- if (TTI.isLegalInterleavedAccessType(
+ if (InterleaveFactor <= Slice.size() &&
+ TTI.isLegalInterleavedAccessType(
getWidenedType(Slice.front()->getType(), VF),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 025234c..332c520 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1032,9 +1032,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) {
+ auto *BI = dyn_cast<BinaryOperator>(&I);
+
// We are looking for a scalar binop of booleans.
// binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
- if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
+ if (!BI || !I.getType()->isIntegerTy(1))
+ return false;
+
+ // TODO: Support non-commutative binary ops.
+ if (!BI->isCommutative())
return false;
// The compare predicates should match, and each compare should have a
@@ -1113,8 +1119,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
- Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
- VCmp, Shuf);
+ Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), VCmp, Shuf);
Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
replaceValue(I, *NewExt);
++NumVecCmpBO;
diff --git a/llvm/test/Analysis/ValueTracking/known-non-equal.ll b/llvm/test/Analysis/ValueTracking/known-non-equal.ll
index d67f1b5..cbc61b3 100644
--- a/llvm/test/Analysis/ValueTracking/known-non-equal.ll
+++ b/llvm/test/Analysis/ValueTracking/known-non-equal.ll
@@ -316,6 +316,56 @@ exit:
ret i1 %cmp
}
+define i1 @known_non_equal_phis_max_recursion_limit(i1 %cond, i32 %switch.cond) {
+; CHECK-LABEL: @known_non_equal_phis_max_recursion_limit(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BB0:%.*]]
+; CHECK: bb0:
+; CHECK-NEXT: [[PHIA_0:%.*]] = phi i32 [ [[PHIA_1:%.*]], [[BB1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[PHIB_0:%.*]] = phi i32 [ [[PHIB_1:%.*]], [[BB1]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[SWITCH_BLOCK:%.*]], label [[EXIT:%.*]]
+; CHECK: switch.block:
+; CHECK-NEXT: switch i32 [[SWITCH_COND:%.*]], label [[BB1]] [
+; CHECK-NEXT: i32 0, label [[EPILOGUE:%.*]]
+; CHECK-NEXT: i32 1, label [[EPILOGUE]]
+; CHECK-NEXT: ]
+; CHECK: bb1:
+; CHECK-NEXT: [[PHIA_1]] = phi i32 [ [[PHIA_0]], [[SWITCH_BLOCK]] ], [ 0, [[EPILOGUE]] ]
+; CHECK-NEXT: [[PHIB_1]] = phi i32 [ [[PHIB_0]], [[SWITCH_BLOCK]] ], [ 0, [[EPILOGUE]] ]
+; CHECK-NEXT: br label [[BB0]]
+; CHECK: epilogue:
+; CHECK-NEXT: br label [[BB1]]
+; CHECK: exit:
+; CHECK-NEXT: [[RET:%.*]] = icmp eq i32 [[PHIA_0]], [[PHIB_0]]
+; CHECK-NEXT: ret i1 [[RET]]
+;
+entry:
+ br label %bb0
+
+bb0:
+ %phiA.0 = phi i32 [ %phiA.1, %bb1 ], [ 0, %entry ]
+ %phiB.0 = phi i32 [ %phiB.1, %bb1 ], [ 0, %entry ]
+ br i1 %cond, label %switch.block, label %exit
+
+switch.block:
+ switch i32 %switch.cond, label %bb1 [
+ i32 0, label %epilogue
+ i32 1, label %epilogue
+ ]
+
+bb1:
+ %phiA.1 = phi i32 [ %phiA.0, %switch.block ], [ 0, %epilogue ]
+ %phiB.1 = phi i32 [ %phiB.0, %switch.block ], [ 0, %epilogue ]
+ br label %bb0
+
+epilogue:
+ br label %bb1
+
+exit:
+ %ret = icmp eq i32 %phiA.0, %phiB.0
+ ret i1 %ret
+}
+
define i1 @known_non_equal_phis_fail(i8 %p, ptr %pq, i8 %n, i8 %r) {
; CHECK-LABEL: @known_non_equal_phis_fail(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-sat-addsub.ll b/llvm/test/Analysis/ValueTracking/knownbits-sat-addsub.ll
index c2926ea..f9618e1 100644
--- a/llvm/test/Analysis/ValueTracking/knownbits-sat-addsub.ll
+++ b/llvm/test/Analysis/ValueTracking/knownbits-sat-addsub.ll
@@ -142,14 +142,7 @@ define i1 @ssub_sat_low_bits(i8 %x, i8 %y) {
define i1 @ssub_sat_fail_may_overflow(i8 %x, i8 %y) {
; CHECK-LABEL: @ssub_sat_fail_may_overflow(
-; CHECK-NEXT: [[XX:%.*]] = and i8 [[X:%.*]], 15
-; CHECK-NEXT: [[YY:%.*]] = and i8 [[Y:%.*]], 15
-; CHECK-NEXT: [[LHS:%.*]] = or i8 [[XX]], 1
-; CHECK-NEXT: [[RHS:%.*]] = and i8 [[YY]], -2
-; CHECK-NEXT: [[EXP:%.*]] = call i8 @llvm.ssub.sat.i8(i8 [[LHS]], i8 [[RHS]])
-; CHECK-NEXT: [[AND:%.*]] = and i8 [[EXP]], 1
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], 0
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%xx = and i8 %x, 15
%yy = and i8 %y, 15
diff --git a/llvm/test/Assembler/mutually-recursive-types.ll b/llvm/test/Assembler/mutually-recursive-types.ll
new file mode 100644
index 0000000..e030d8e
--- /dev/null
+++ b/llvm/test/Assembler/mutually-recursive-types.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: error: identified structure type 'rt3' is recursive
+
+%rt1 = type { i32, { i8, %rt2, i8 }, i32 }
+%rt2 = type { i64, { i6, %rt3 } }
+%rt3 = type { %rt1 }
diff --git a/llvm/test/Assembler/unsized-recursive-type.ll b/llvm/test/Assembler/unsized-recursive-type.ll
index e849e91..76c8d01 100644
--- a/llvm/test/Assembler/unsized-recursive-type.ll
+++ b/llvm/test/Assembler/unsized-recursive-type.ll
@@ -1,9 +1,5 @@
; RUN: not llvm-as < %s 2>&1 | FileCheck %s
-; CHECK: base element of getelementptr must be sized
+; CHECK: error: identified structure type 'myTy' is recursive
%myTy = type { %myTy }
-define void @foo(ptr %p){
- getelementptr %myTy, ptr %p, i64 1
- ret void
-}
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index e9fbaf6..725c44c 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -8,7 +8,7 @@ declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare half @llvm.fma.f16(half, half, half) #1
-define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
+define <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfma_lane_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -20,7 +20,7 @@ entry:
ret <4 x half> %fmla3
}
-define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmaq_lane_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -32,7 +32,7 @@ entry:
ret <8 x half> %fmla3
}
-define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
+define <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfma_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
@@ -43,7 +43,7 @@ entry:
ret <4 x half> %0
}
-define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmaq_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
@@ -54,7 +54,7 @@ entry:
ret <8 x half> %0
}
-define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
+define <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
; CHECK-LABEL: t_vfma_n_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
@@ -67,7 +67,7 @@ entry:
ret <4 x half> %0
}
-define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
+define <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
; CHECK-LABEL: t_vfmaq_n_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
@@ -80,7 +80,7 @@ entry:
ret <8 x half> %0
}
-define dso_local half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -92,7 +92,7 @@ entry:
ret half %0
}
-define dso_local half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -104,7 +104,7 @@ entry:
ret half %0
}
-define dso_local half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16_3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -116,7 +116,35 @@ entry:
ret half %0
}
-define dso_local half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmah_lane_f16_3_0(half %a, <4 x half> %c) {
+; CHECK-LABEL: t_vfmah_lane_f16_3_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov h2, v1.h[3]
+; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: ret
+entry:
+ %b = extractelement <4 x half> %c, i32 0
+ %extract = extractelement <4 x half> %c, i32 3
+ %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
+ ret half %0
+}
+
+define half @t_vfmah_lane_f16_0_0(half %a, <4 x half> %b, <4 x half> %c) {
+; CHECK-LABEL: t_vfmah_lane_f16_0_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmadd h0, h1, h2, h0
+; CHECK-NEXT: ret
+entry:
+ %b0 = extractelement <4 x half> %b, i32 0
+ %c0 = extractelement <4 x half> %c, i32 0
+ %0 = tail call half @llvm.fma.f16(half %b0, half %c0, half %a)
+ ret half %0
+}
+
+define half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmadd h0, h1, h2, h0
@@ -127,7 +155,7 @@ entry:
ret half %0
}
-define dso_local half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmadd h0, h2, h1, h0
@@ -138,7 +166,7 @@ entry:
ret half %0
}
-define dso_local half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16_7:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla h0, h1, v2.h[7]
@@ -149,7 +177,7 @@ entry:
ret half %0
}
-define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
+define <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfms_lane_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -162,7 +190,7 @@ entry:
ret <4 x half> %fmla3
}
-define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsq_lane_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -175,7 +203,7 @@ entry:
ret <8 x half> %fmla3
}
-define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
+define <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfms_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
@@ -187,7 +215,7 @@ entry:
ret <4 x half> %0
}
-define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
+define <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsq_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
@@ -199,7 +227,7 @@ entry:
ret <8 x half> %0
}
-define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
+define <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
; CHECK-LABEL: t_vfms_n_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
@@ -213,7 +241,7 @@ entry:
ret <4 x half> %0
}
-define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
+define <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
; CHECK-LABEL: t_vfmsq_n_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
@@ -227,7 +255,7 @@ entry:
ret <8 x half> %0
}
-define dso_local half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -240,7 +268,7 @@ entry:
ret half %1
}
-define dso_local half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -253,7 +281,7 @@ entry:
ret half %1
}
-define dso_local half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16_3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -266,7 +294,7 @@ entry:
ret half %1
}
-define dso_local half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmsub h0, h2, h1, h0
@@ -278,7 +306,22 @@ entry:
ret half %1
}
-define dso_local half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmsh_lane_f16_0_3(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_lane_f16_0_3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov h2, v1.h[3]
+; CHECK-NEXT: fmsub h0, h2, h1, h0
+; CHECK-NEXT: ret
+entry:
+ %b = extractelement <4 x half> %c, i32 0
+ %0 = fsub half 0xH8000, %b
+ %extract = extractelement <4 x half> %c, i32 3
+ %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
+ ret half %1
+}
+
+define half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmsub h0, h2, h1, h0
@@ -290,7 +333,7 @@ entry:
ret half %1
}
-define dso_local half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
+define half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16_7:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls h0, h1, v2.h[7]
@@ -302,7 +345,7 @@ entry:
ret half %1
}
-define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
+define <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmul_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul v0.4h, v0.4h, v1.h[0]
@@ -313,7 +356,7 @@ entry:
ret <4 x half> %mul
}
-define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
+define <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulq_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul v0.8h, v0.8h, v1.h[0]
@@ -324,7 +367,7 @@ entry:
ret <8 x half> %mul
}
-define dso_local half @t_vmulh_lane0_f16(half %a, <4 x half> %c, i32 %lane) {
+define half @t_vmulh_lane0_f16(half %a, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vmulh_lane0_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
@@ -336,7 +379,7 @@ entry:
ret half %1
}
-define dso_local half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
+define half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vmulh_lane3_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
@@ -348,7 +391,7 @@ entry:
ret half %1
}
-define dso_local half @t_vmulh_laneq0_f16(half %a, <8 x half> %c, i32 %lane) {
+define half @t_vmulh_laneq0_f16(half %a, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vmulh_laneq0_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul h0, h0, h1
@@ -359,7 +402,7 @@ entry:
ret half %1
}
-define dso_local half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
+define half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vmulh_laneq7_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul h0, h0, v1.h[7]
@@ -370,7 +413,7 @@ entry:
ret half %1
}
-define dso_local half @t_vmulx_f16(half %a, half %b) {
+define half @t_vmulx_f16(half %a, half %b) {
; CHECK-LABEL: t_vmulx_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx h0, h0, h1
@@ -380,7 +423,7 @@ entry:
ret half %fmulx.i
}
-define dso_local half @t_vmulxh_lane0_f16(half %a, <4 x half> %b) {
+define half @t_vmulxh_lane0_f16(half %a, <4 x half> %b) {
; CHECK-LABEL: t_vmulxh_lane0_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
@@ -392,7 +435,7 @@ entry:
ret half %fmulx.i
}
-define dso_local half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
+define half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxh_lane3_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
@@ -404,7 +447,7 @@ entry:
ret half %fmulx.i
}
-define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
+define <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulx_lane_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
@@ -416,7 +459,7 @@ entry:
ret <4 x half> %vmulx2.i
}
-define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
+define <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxq_lane_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
@@ -428,7 +471,7 @@ entry:
ret <8 x half> %vmulx2.i
}
-define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
+define <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulx_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.h[0]
@@ -439,7 +482,7 @@ entry:
ret <4 x half> %vmulx2.i
}
-define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
+define <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxq_laneq_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.h[0]
@@ -450,7 +493,7 @@ entry:
ret <8 x half> %vmulx2.i
}
-define dso_local half @t_vmulxh_laneq0_f16(half %a, <8 x half> %b) {
+define half @t_vmulxh_laneq0_f16(half %a, <8 x half> %b) {
; CHECK-LABEL: t_vmulxh_laneq0_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx h0, h0, h1
@@ -461,7 +504,7 @@ entry:
ret half %fmulx.i
}
-define dso_local half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
+define half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxh_laneq7_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx h0, h0, v1.h[7]
@@ -472,7 +515,7 @@ entry:
ret half %fmulx.i
}
-define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
+define <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
; CHECK-LABEL: t_vmulx_n_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
@@ -486,7 +529,7 @@ entry:
ret <4 x half> %vmulx2.i
}
-define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
+define <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
; CHECK-LABEL: t_vmulxq_n_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
@@ -500,7 +543,7 @@ entry:
ret <8 x half> %vmulx2.i
}
-define dso_local half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
+define half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
; CHECK-LABEL: t_vfmah_lane3_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -512,7 +555,7 @@ entry:
ret half %0
}
-define dso_local half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
+define half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
; CHECK-LABEL: t_vfmah_laneq7_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla h0, h1, v2.h[7]
@@ -523,7 +566,7 @@ entry:
ret half %0
}
-define dso_local half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
+define half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
; CHECK-LABEL: t_vfmsh_lane3_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
@@ -536,7 +579,7 @@ entry:
ret half %1
}
-define dso_local half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
+define half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
; CHECK-LABEL: t_vfmsh_laneq7_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls h0, h1, v2.h[7]
@@ -548,7 +591,7 @@ entry:
ret half %1
}
-define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
+define half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: t_fadd_vfmah_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fadd v2.4h, v2.4h, v3.4h
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 543605a..3fa5d64 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -465,3 +465,17 @@ define <2 x fp128> @load_v2f128(ptr %p) {
%a = load <2 x fp128>, ptr %p
ret <2 x fp128> %a
}
+
+define i32 @load_i8_s16_extrasuse(ptr %ptr, ptr %ptr2) {
+; CHECK-LABEL: load_i8_s16_extrasuse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: sxtb w0, w8
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %a = load i32, ptr %ptr
+ %s = shl i32 %a, 24
+ %b = ashr i32 %s, 24
+ store i32 %a, ptr %ptr2
+ ret i32 %b
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index ed88293..b2ea6ff 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
attributes #0 = { strictfp }
@@ -8,112 +9,252 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metada
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
define float @test_fmla_ss4S_0(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss4S_0
- ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss4S_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 0
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
ret float %tmp2
}
define float @test_fmla_ss4S_0_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss4S_0_swap
- ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss4S_0_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd s0, s2, s1, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 0
%tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a)
ret float %tmp2
}
define float @test_fmla_ss4S_3(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss4S_3
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla s0, s1, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
ret float %tmp2
}
define float @test_fmla_ss4S_3_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss4S_3_swap
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla s0, s0, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
ret float %tmp2
}
define float @test_fmla_ss2S_0(float %a, float %b, <2 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss2S_0
- ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss2S_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 0
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
ret float %tmp2
}
define float @test_fmla_ss2S_0_swap(float %a, float %b, <2 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss2S_0_swap
- ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss2S_0_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmadd s0, s2, s1, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 0
%tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a)
ret float %tmp2
}
define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) {
- ; CHECK-LABEL: test_fmla_ss2S_1
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmla_ss2S_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla s0, s1, v2.s[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
ret float %tmp2
}
+define float @test_fmla_ss4S_3_ext0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmla_ss4S_3_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s2, v1.s[3]
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <4 x float> %v, i32 0
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss4S_3_ext0_swp(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmla_ss4S_3_ext0_swp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s2, v1.s[3]
+; CHECK-NEXT: fmadd s0, s2, s1, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <4 x float> %v, i32 0
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = call float @llvm.fma.f32(float %tmp1, float %tmp0, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss4S_0_ext0(float %a, <4 x float> %v, <4 x float> %w) {
+; CHECK-LABEL: test_fmla_ss4S_0_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <4 x float> %v, i32 0
+ %tmp1 = extractelement <4 x float> %w, i32 0
+ %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss2S_3_ext0(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmla_ss2S_3_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov s2, v1.s[1]
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x float> %v, i32 0
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss2S_3_ext0_swp(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmla_ss2S_3_ext0_swp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov s2, v1.s[1]
+; CHECK-NEXT: fmadd s0, s2, s1, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x float> %v, i32 0
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = call float @llvm.fma.f32(float %tmp1, float %tmp0, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss2S_0_ext0(float %a, <2 x float> %v, <2 x float> %w) {
+; CHECK-LABEL: test_fmla_ss2S_0_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x float> %v, i32 0
+ %tmp1 = extractelement <2 x float> %w, i32 0
+ %tmp2 = call float @llvm.fma.f32(float %tmp0, float %tmp1, float %a)
+ ret float %tmp2
+}
+
define double @test_fmla_ddD_0(double %a, double %b, <1 x double> %v) {
- ; CHECK-LABEL: test_fmla_ddD_0
- ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_ddD_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
}
define double @test_fmla_ddD_0_swap(double %a, double %b, <1 x double> %v) {
- ; CHECK-LABEL: test_fmla_ddD_0_swap
- ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_ddD_0_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d2, d1, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
ret double %tmp2
}
define double @test_fmla_dd2D_0(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmla_dd2D_0
- ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_dd2D_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
}
define double @test_fmla_dd2D_0_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmla_dd2D_0_swap
- ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_dd2D_0_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d2, d1, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
ret double %tmp2
}
define double @test_fmla_dd2D_1(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmla_dd2D_1
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
}
define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmla_dd2D_1_swap
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
ret double %tmp2
}
+define double @test_fmla_ss2D_1_ext0(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmla_ss2D_1_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d2, v1.d[1]
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x double> %v, i32 0
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = call double @llvm.fma.f64(double %tmp0, double %tmp1, double %a)
+ ret double %tmp2
+}
+
+define double @test_fmla_ss2D_1_ext0_swp(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmla_ss2D_1_ext0_swp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d2, v1.d[1]
+; CHECK-NEXT: fmadd d0, d2, d1, d0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x double> %v, i32 0
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = call double @llvm.fma.f64(double %tmp1, double %tmp0, double %a)
+ ret double %tmp2
+}
+
+define double @test_fmla_ss2D_0_ext0(double %a, <2 x double> %v, <2 x double> %w) {
+; CHECK-LABEL: test_fmla_ss2D_0_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x double> %v, i32 0
+ %tmp1 = extractelement <2 x double> %w, i32 0
+ %tmp2 = call double @llvm.fma.f64(double %tmp0, double %tmp1, double %a)
+ ret double %tmp2
+}
+
define float @test_fmls_ss4S_0(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss4S_0
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <4 x float> %v, i64 0
@@ -122,8 +263,10 @@ entry:
}
define float @test_fmls_ss4S_0_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss4S_0_swap
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0_swap:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <4 x float> %v, i64 0
@@ -132,8 +275,11 @@ entry:
}
define float @test_fmls_ss4S_3(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss4S_3
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s1, v2.s[3]
+; CHECK-NEXT: fmls s0, s1, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
%tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
@@ -141,18 +287,23 @@ define float @test_fmls_ss4S_3(float %a, float %b, <4 x float> %v) {
}
define float @test_fmls_ss4S_3_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss4S_3_swap
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s1, v2.s[3]
+; CHECK-NEXT: fmls s0, s1, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
%tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a)
ret float %tmp3
}
-
define float @test_fmls_ss2S_0(float %a, float %b, <2 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss2S_0
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <2 x float> %v, i64 0
@@ -161,8 +312,11 @@ entry:
}
define float @test_fmls_ss2S_0_swap(float %a, float %b, <2 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss2S_0_swap
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0_swap:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <2 x float> %v, i64 0
@@ -171,17 +325,48 @@ entry:
}
define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) {
- ; CHECK-LABEL: test_fmls_ss2S_1
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmls_ss2S_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov s1, v2.s[1]
+; CHECK-NEXT: fmls s0, s1, v2.s[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fsub float -0.0, %tmp1
%tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
ret float %tmp3
}
+define float @test_fmls_ss4S_3_ext0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmls_ss4S_3_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s2, v1.s[3]
+; CHECK-NEXT: fmsub s0, s1, s2, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <4 x float> %v, i32 0
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = fsub float -0.0, %tmp1
+ %tmp3 = call float @llvm.fma.f32(float %tmp0, float %tmp2, float %a)
+ ret float %tmp3
+}
+
+define float @test_fmls_ss4S_0_ext0(float %a, <4 x float> %v, <4 x float> %w) {
+; CHECK-LABEL: test_fmls_ss4S_0_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmsub s0, s1, s2, s0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <4 x float> %v, i32 0
+ %tmp1 = extractelement <4 x float> %w, i32 0
+ %tmp2 = fsub float -0.0, %tmp1
+ %tmp3 = call float @llvm.fma.f32(float %tmp0, float %tmp2, float %a)
+ ret float %tmp3
+}
+
define double @test_fmls_ddD_0(double %a, double %b, <1 x double> %v) {
- ; CHECK-LABEL: test_fmls_ddD_0
- ; CHECK: fmsub d0, d1, d2, d0
+; CHECK-LABEL: test_fmls_ddD_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d1, d2, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <1 x double> %v, i64 0
@@ -190,8 +375,10 @@ entry:
}
define double @test_fmls_ddD_0_swap(double %a, double %b, <1 x double> %v) {
- ; CHECK-LABEL: test_fmls_ddD_0_swap
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_ddD_0_swap:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <1 x double> %v, i64 0
@@ -200,8 +387,10 @@ entry:
}
define double @test_fmls_dd2D_0(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmls_dd2D_0
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <2 x double> %v, i64 0
@@ -210,8 +399,10 @@ entry:
}
define double @test_fmls_dd2D_0_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmls_dd2D_0_swap
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0_swap:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <2 x double> %v, i64 0
@@ -220,8 +411,11 @@ entry:
}
define double @test_fmls_dd2D_1(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmls_dd2D_1
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d1, v2.d[1]
+; CHECK-NEXT: fmls d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
@@ -229,121 +423,180 @@ define double @test_fmls_dd2D_1(double %a, double %b, <2 x double> %v) {
}
define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK-LABEL: test_fmls_dd2D_1_swap
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1_swap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d1, v2.d[1]
+; CHECK-NEXT: fmls d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a)
ret double %tmp3
}
+define double @test_fmls_dd2D_1_ext0(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmls_dd2D_1_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d2, v1.d[1]
+; CHECK-NEXT: fmsub d0, d1, d2, d0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x double> %v, i32 0
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = fsub double -0.0, %tmp1
+ %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp0, double %a)
+ ret double %tmp3
+}
+
+define double @test_fmls_dd2D_0_ext0(double %a, <2 x double> %v, <2 x double> %w) {
+; CHECK-LABEL: test_fmls_dd2D_0_ext0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmsub d0, d1, d2, d0
+; CHECK-NEXT: ret
+ %tmp0 = extractelement <2 x double> %v, i32 0
+ %tmp1 = extractelement <2 x double> %w, i32 0
+ %tmp2 = fsub double -0.0, %tmp1
+ %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp0, double %a)
+ ret double %tmp3
+}
+
define float @test_fmla_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss4S_0_strict
- ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss4S_0_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 0
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define float @test_fmla_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss4S_0_swap_strict
- ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss4S_0_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd s0, s2, s1, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 0
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define float @test_fmla_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss4S_3_strict
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla s0, s1, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define float @test_fmla_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss4S_3_swap_strict
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmla_ss4S_3_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla s0, s0, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define float @test_fmla_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss2S_0_strict
- ; CHECK: fmadd s0, s1, s2, s0
+; CHECK-LABEL: test_fmla_ss2S_0_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 0
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define float @test_fmla_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss2S_0_swap_strict
- ; CHECK: fmadd s0, s2, s1, s0
+; CHECK-LABEL: test_fmla_ss2S_0_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmadd s0, s2, s1, s0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 0
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define float @test_fmla_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
- ; CHECK-LABEL: test_fmla_ss2S_1_strict
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmla_ss2S_1_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla s0, s1, v2.s[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret float %tmp2
}
define double @test_fmla_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 {
- ; CHECK-LABEL: test_fmla_ddD_0_strict
- ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_ddD_0_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret double %tmp2
}
define double @test_fmla_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 {
- ; CHECK-LABEL: test_fmla_ddD_0_swap_strict
- ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_ddD_0_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d2, d1, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret double %tmp2
}
define double @test_fmla_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmla_dd2D_0_strict
- ; CHECK: fmadd d0, d1, d2, d0
+; CHECK-LABEL: test_fmla_dd2D_0_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret double %tmp2
}
define double @test_fmla_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmla_dd2D_0_swap_strict
- ; CHECK: fmadd d0, d2, d1, d0
+; CHECK-LABEL: test_fmla_dd2D_0_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmadd d0, d2, d1, d0
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret double %tmp2
}
define double @test_fmla_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmla_dd2D_1_strict
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret double %tmp2
}
define double @test_fmla_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmla_dd2D_1_swap_strict
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmla_dd2D_1_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret double %tmp2
}
define float @test_fmls_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss4S_0_strict
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <4 x float> %v, i64 0
@@ -352,8 +605,10 @@ entry:
}
define float @test_fmls_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss4S_0_swap_strict
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss4S_0_swap_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <4 x float> %v, i64 0
@@ -362,8 +617,11 @@ entry:
}
define float @test_fmls_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss4S_3_strict
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s1, v2.s[3]
+; CHECK-NEXT: fmls s0, s1, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fneg float %tmp1
%tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -371,8 +629,11 @@ define float @test_fmls_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
}
define float @test_fmls_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss4S_3_swap_strict
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-LABEL: test_fmls_ss4S_3_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s1, v2.s[3]
+; CHECK-NEXT: fmls s0, s1, v2.s[3]
+; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fneg float %tmp1
%tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %tmp2, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -380,8 +641,11 @@ define float @test_fmls_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #
}
define float @test_fmls_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss2S_0_strict
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <2 x float> %v, i64 0
@@ -390,8 +654,11 @@ entry:
}
define float @test_fmls_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss2S_0_swap_strict
- ; CHECK: fmsub s0, s2, s1, s0
+; CHECK-LABEL: test_fmls_ss2S_0_swap_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmsub s0, s2, s1, s0
+; CHECK-NEXT: ret
entry:
%fneg = fneg float %b
%extract = extractelement <2 x float> %v, i64 0
@@ -400,8 +667,12 @@ entry:
}
define float @test_fmls_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
- ; CHECK-LABEL: test_fmls_ss2S_1_strict
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_fmls_ss2S_1_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov s1, v2.s[1]
+; CHECK-NEXT: fmls s0, s1, v2.s[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fneg float %tmp1
%tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -409,8 +680,10 @@ define float @test_fmls_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
}
define double @test_fmls_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 {
- ; CHECK-LABEL: test_fmls_ddD_0_strict
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_ddD_0_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <1 x double> %v, i64 0
@@ -419,8 +692,10 @@ entry:
}
define double @test_fmls_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 {
- ; CHECK-LABEL: test_fmls_ddD_0_swap_strict
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_ddD_0_swap_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <1 x double> %v, i64 0
@@ -429,8 +704,10 @@ entry:
}

define double @test_fmls_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmls_dd2D_0_strict
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <2 x double> %v, i64 0
@@ -439,8 +716,10 @@ entry:
}

define double @test_fmls_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmls_dd2D_0_swap_strict
- ; CHECK: fmsub d0, d2, d1, d0
+; CHECK-LABEL: test_fmls_dd2D_0_swap_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
entry:
%fneg = fneg double %b
%extract = extractelement <2 x double> %v, i64 0
@@ -449,8 +728,11 @@ entry:
}

define double @test_fmls_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmls_dd2D_1_strict
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d1, v2.d[1]
+; CHECK-NEXT: fmls d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fneg double %tmp1
%tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -458,8 +740,11 @@ define double @test_fmls_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0
}

define double @test_fmls_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 {
- ; CHECK-LABEL: test_fmls_dd2D_1_swap_strict
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_fmls_dd2D_1_swap_strict:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov d1, v2.d[1]
+; CHECK-NEXT: fmls d0, d1, v2.d[1]
+; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fneg double %tmp1
%tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %tmp2, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
diff --git a/llvm/test/CodeGen/AArch64/srem-lkk.ll b/llvm/test/CodeGen/AArch64/srem-lkk.ll
index d9f9144..1223ae3 100644
--- a/llvm/test/CodeGen/AArch64/srem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-lkk.ll
@@ -23,12 +23,11 @@ define i32 @fold_srem_positive_even(i32 %x) {
; CHECK-LABEL: fold_srem_positive_even:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #36849 // =0x8ff1
+; CHECK-NEXT: mov w9, #1060 // =0x424
; CHECK-NEXT: movk w8, #15827, lsl #16
; CHECK-NEXT: smull x8, w0, w8
-; CHECK-NEXT: lsr x9, x8, #63
; CHECK-NEXT: asr x8, x8, #40
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: mov w9, #1060 // =0x424
+; CHECK-NEXT: add w8, w8, w8, lsr #31
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = srem i32 %x, 1060
@@ -40,12 +39,11 @@ define i32 @fold_srem_negative_odd(i32 %x) {
; CHECK-LABEL: fold_srem_negative_odd:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #65445 // =0xffa5
+; CHECK-NEXT: mov w9, #-723 // =0xfffffd2d
; CHECK-NEXT: movk w8, #42330, lsl #16
; CHECK-NEXT: smull x8, w0, w8
-; CHECK-NEXT: lsr x9, x8, #63
; CHECK-NEXT: asr x8, x8, #40
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: mov w9, #-723 // =0xfffffd2d
+; CHECK-NEXT: add w8, w8, w8, lsr #31
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = srem i32 %x, -723
@@ -57,12 +55,11 @@ define i32 @fold_srem_negative_even(i32 %x) {
; CHECK-LABEL: fold_srem_negative_even:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #62439 // =0xf3e7
+; CHECK-NEXT: mov w9, #-22981 // =0xffffa63b
; CHECK-NEXT: movk w8, #64805, lsl #16
; CHECK-NEXT: smull x8, w0, w8
-; CHECK-NEXT: lsr x9, x8, #63
; CHECK-NEXT: asr x8, x8, #40
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: mov w9, #-22981 // =0xffffa63b
+; CHECK-NEXT: add w8, w8, w8, lsr #31
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = srem i32 %x, -22981
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 9fbce05e..884d668 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -25,8 +25,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: sbfx w8, w0, #0, #4
; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: ubfx w9, w8, #7, #1
-; CHECK-NEXT: add w8, w9, w8, lsr #4
+; CHECK-NEXT: lsr w9, w8, #4
+; CHECK-NEXT: add w8, w9, w8, lsr #31
; CHECK-NEXT: mov w9, #6 // =0x6
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: and w8, w8, #0xf
diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
index a74f0c8..b165ac0 100644
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -263,16 +263,14 @@ define <2 x i32> @fold_srem_v2i32(<2 x i32> %x) {
; CHECK-LABEL: fold_srem_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #26215 // =0x6667
-; CHECK-NEXT: movi v3.2s, #10
+; CHECK-NEXT: movi v2.2s, #10
; CHECK-NEXT: movk w8, #26214, lsl #16
; CHECK-NEXT: dup v1.2s, w8
; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: ushr v2.2d, v1.2d, #63
; CHECK-NEXT: sshr v1.2d, v1.2d, #34
-; CHECK-NEXT: xtn v2.2s, v2.2d
; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: add v1.2s, v1.2s, v2.2s
-; CHECK-NEXT: mls v0.2s, v1.2s, v3.2s
+; CHECK-NEXT: usra v1.2s, v1.2s, #31
+; CHECK-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-NEXT: ret
%1 = srem <2 x i32> %x, <i32 10, i32 10>
ret <2 x i32> %1
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
index f6b0405..fc6cd74 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
@@ -1828,3 +1828,211 @@ body: |
SI_RETURN implicit $vgpr0, implicit $sgpr4_sgpr5
...
+
+---
+name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 32768, alignment: 4, local-offset: 0 }
+ - { id: 1, size: 32768, alignment: 4, local-offset: 32768 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+
+ ; GFX7-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX7: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX7-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX7-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX7-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX7-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX7-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX8-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX8: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX8-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX8-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX8-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX900-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX900: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX900-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX900-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX900-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX900-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX900-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90A-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX90A: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX90A-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX10: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: $sgpr96_sgpr97_sgpr98_sgpr99 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: $sgpr96 = S_ADD_U32 $sgpr96, $noreg, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
+ ; GFX10-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
+ ; GFX10-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 32772, killed $sgpr0, 0, implicit $exec
+ ; GFX10-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX940: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX940-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX940-NEXT: S_ENDPGM 0
+ ;
+ ; GFX11-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX11: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 32772, killed $sgpr0, 0, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: v_add_co_u32_e64_fi_sgpr_clobbered_register
+ ; GFX12: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 32768, killed $sgpr0, 0, implicit $exec
+ ; GFX12-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX12-NEXT: S_ENDPGM 0
+ renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 %stack.1, killed $sgpr0, 0, implicit $exec
+ renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ S_ENDPGM 0
+
+...
+
+---
+name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 32768, alignment: 4, local-offset: 0 }
+ - { id: 1, size: 32768, alignment: 4, local-offset: 32768 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+
+ ; GFX7-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX7: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX7-NEXT: {{ $}}
+ ; GFX7-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX7-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX7-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX7-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX7-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX7-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX7-NEXT: S_ENDPGM 0
+ ;
+ ; GFX8-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX8: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX8-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX8-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX8-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX900-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX900: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX900-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX900-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX900-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX900-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX900-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90A-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX90A: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $noreg, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX90A-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX10: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: $sgpr96_sgpr97_sgpr98_sgpr99 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX10-NEXT: $sgpr96 = S_ADD_U32 $sgpr96, $noreg, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
+ ; GFX10-NEXT: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
+ ; GFX10-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 32772, killed $sgpr0, 0, implicit $exec
+ ; GFX10-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX940-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX940: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+ ; GFX940-NEXT: {{ $}}
+ ; GFX940-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
+ ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, killed $sgpr0, 0, implicit $exec
+ ; GFX940-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX940-NEXT: S_ENDPGM 0
+ ;
+ ; GFX11-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX11: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 32772, killed $sgpr0, 0, implicit $exec
+ ; GFX11-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: v_add_co_u32_e64_sgpr_fi_clobbered_register
+ ; GFX12: liveins: $sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000003C
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 32768, killed $sgpr0, 0, implicit $exec
+ ; GFX12-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ ; GFX12-NEXT: S_ENDPGM 0
+ renamable $sgpr0 = S_LSHL_B32 renamable $sgpr6, 2, implicit-def dead $scc
+ renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 %stack.1, killed $sgpr0, 0, implicit $exec
+ renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir
index 7c25e5f..9c2fef0 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir
@@ -1220,9 +1220,8 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
- ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec
; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec
- ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, killed $vgpr1, 0, implicit $exec
+ ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec
; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8
;
; MUBUFW32-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after
@@ -1236,9 +1235,8 @@ body: |
; FLATSCRW64-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after
; FLATSCRW64: liveins: $sgpr8
; FLATSCRW64-NEXT: {{ $}}
- ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec
; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec
- ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, killed $vgpr1, 0, implicit $exec
+ ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec
; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8
;
; FLATSCRW32-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after
diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
index 7f56215..9733624 100644
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -115,11 +115,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM5-LABEL: test_srem_even:
; ARM5: @ %bb.0:
; ARM5-NEXT: lsl r1, r0, #28
-; ARM5-NEXT: mov r2, #1
; ARM5-NEXT: asr r1, r1, #28
; ARM5-NEXT: add r1, r1, r1, lsl #1
-; ARM5-NEXT: and r2, r2, r1, lsr #7
-; ARM5-NEXT: add r1, r2, r1, lsr #4
+; ARM5-NEXT: lsr r2, r1, #4
+; ARM5-NEXT: add r1, r2, r1, lsr #31
; ARM5-NEXT: add r1, r1, r1, lsl #1
; ARM5-NEXT: sub r0, r0, r1, lsl #1
; ARM5-NEXT: and r0, r0, #15
@@ -131,11 +130,10 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM6-LABEL: test_srem_even:
; ARM6: @ %bb.0:
; ARM6-NEXT: lsl r1, r0, #28
-; ARM6-NEXT: mov r2, #1
; ARM6-NEXT: asr r1, r1, #28
; ARM6-NEXT: add r1, r1, r1, lsl #1
-; ARM6-NEXT: and r2, r2, r1, lsr #7
-; ARM6-NEXT: add r1, r2, r1, lsr #4
+; ARM6-NEXT: lsr r2, r1, #4
+; ARM6-NEXT: add r1, r2, r1, lsr #31
; ARM6-NEXT: add r1, r1, r1, lsl #1
; ARM6-NEXT: sub r0, r0, r1, lsl #1
; ARM6-NEXT: and r0, r0, #15
@@ -148,8 +146,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM7: @ %bb.0:
; ARM7-NEXT: sbfx r1, r0, #0, #4
; ARM7-NEXT: add r1, r1, r1, lsl #1
-; ARM7-NEXT: ubfx r2, r1, #7, #1
-; ARM7-NEXT: add r1, r2, r1, lsr #4
+; ARM7-NEXT: lsr r2, r1, #4
+; ARM7-NEXT: add r1, r2, r1, lsr #31
; ARM7-NEXT: add r1, r1, r1, lsl #1
; ARM7-NEXT: sub r0, r0, r1, lsl #1
; ARM7-NEXT: and r0, r0, #15
@@ -162,8 +160,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; ARM8: @ %bb.0:
; ARM8-NEXT: sbfx r1, r0, #0, #4
; ARM8-NEXT: add r1, r1, r1, lsl #1
-; ARM8-NEXT: ubfx r2, r1, #7, #1
-; ARM8-NEXT: add r1, r2, r1, lsr #4
+; ARM8-NEXT: lsr r2, r1, #4
+; ARM8-NEXT: add r1, r2, r1, lsr #31
; ARM8-NEXT: add r1, r1, r1, lsl #1
; ARM8-NEXT: sub r0, r0, r1, lsl #1
; ARM8-NEXT: and r0, r0, #15
@@ -176,8 +174,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; NEON7: @ %bb.0:
; NEON7-NEXT: sbfx r1, r0, #0, #4
; NEON7-NEXT: add r1, r1, r1, lsl #1
-; NEON7-NEXT: ubfx r2, r1, #7, #1
-; NEON7-NEXT: add r1, r2, r1, lsr #4
+; NEON7-NEXT: lsr r2, r1, #4
+; NEON7-NEXT: add r1, r2, r1, lsr #31
; NEON7-NEXT: add r1, r1, r1, lsl #1
; NEON7-NEXT: sub r0, r0, r1, lsl #1
; NEON7-NEXT: and r0, r0, #15
@@ -190,8 +188,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; NEON8: @ %bb.0:
; NEON8-NEXT: sbfx r1, r0, #0, #4
; NEON8-NEXT: add r1, r1, r1, lsl #1
-; NEON8-NEXT: ubfx r2, r1, #7, #1
-; NEON8-NEXT: add r1, r2, r1, lsr #4
+; NEON8-NEXT: lsr r2, r1, #4
+; NEON8-NEXT: add r1, r2, r1, lsr #31
; NEON8-NEXT: add r1, r1, r1, lsl #1
; NEON8-NEXT: sub r0, r0, r1, lsl #1
; NEON8-NEXT: and r0, r0, #15
diff --git a/llvm/test/CodeGen/LoongArch/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/fp-rounding.ll
new file mode 100644
index 0000000..19c4e3f
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/fp-rounding.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=-lsx < %s | FileCheck %s --check-prefix=NOLSX
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LSX
+
+;; ceilf
+define float @ceil_f32(float %i) nounwind {
+; NOLSX-LABEL: ceil_f32:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(ceilf)
+;
+; LSX-LABEL: ceil_f32:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(ceilf)
+entry:
+ %0 = call float @llvm.ceil.f32(float %i)
+ ret float %0
+}
+
+;; ceil
+define double @ceil_f64(double %i) nounwind {
+; NOLSX-LABEL: ceil_f64:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(ceil)
+;
+; LSX-LABEL: ceil_f64:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(ceil)
+entry:
+ %0 = call double @llvm.ceil.f64(double %i)
+ ret double %0
+}
+
+;; floorf
+define float @floor_f32(float %i) nounwind {
+; NOLSX-LABEL: floor_f32:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(floorf)
+;
+; LSX-LABEL: floor_f32:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(floorf)
+entry:
+ %0 = call float @llvm.floor.f32(float %i)
+ ret float %0
+}
+
+;; floor
+define double @floor_f64(double %i) nounwind {
+; NOLSX-LABEL: floor_f64:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(floor)
+;
+; LSX-LABEL: floor_f64:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(floor)
+entry:
+ %0 = call double @llvm.floor.f64(double %i)
+ ret double %0
+}
+
+;; truncf
+define float @trunc_f32(float %i) nounwind {
+; NOLSX-LABEL: trunc_f32:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(truncf)
+;
+; LSX-LABEL: trunc_f32:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(truncf)
+entry:
+ %0 = call float @llvm.trunc.f32(float %i)
+ ret float %0
+}
+
+;; trunc
+define double @trunc_f64(double %i) nounwind {
+; NOLSX-LABEL: trunc_f64:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(trunc)
+;
+; LSX-LABEL: trunc_f64:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(trunc)
+entry:
+ %0 = call double @llvm.trunc.f64(double %i)
+ ret double %0
+}
+
+;; roundevenf
+define float @roundeven_f32(float %i) nounwind {
+; NOLSX-LABEL: roundeven_f32:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(roundevenf)
+;
+; LSX-LABEL: roundeven_f32:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(roundevenf)
+entry:
+ %0 = call float @llvm.roundeven.f32(float %i)
+ ret float %0
+}
+
+;; roundeven
+define double @roundeven_f64(double %i) nounwind {
+; NOLSX-LABEL: roundeven_f64:
+; NOLSX: # %bb.0: # %entry
+; NOLSX-NEXT: b %plt(roundeven)
+;
+; LSX-LABEL: roundeven_f64:
+; LSX: # %bb.0: # %entry
+; LSX-NEXT: b %plt(roundeven)
+entry:
+ %0 = call double @llvm.roundeven.f64(double %i)
+ ret double %0
+}
+
+declare float @llvm.ceil.f32(float)
+declare double @llvm.ceil.f64(double)
+declare float @llvm.floor.f32(float)
+declare double @llvm.floor.f64(double)
+declare float @llvm.trunc.f32(float)
+declare double @llvm.trunc.f64(double)
+declare float @llvm.roundeven.f32(float)
+declare double @llvm.roundeven.f64(double)
diff --git a/llvm/test/CodeGen/LoongArch/merge-load-store.ll b/llvm/test/CodeGen/LoongArch/merge-load-store.ll
new file mode 100644
index 0000000..2eac65e
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/merge-load-store.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA64 %s
+
+define void @merge_load_store(ptr noalias nocapture noundef readonly align 1 dereferenceable(8) %src, ptr noalias nocapture noundef writeonly align 1 dereferenceable(8) %dst) unnamed_addr {
+; LA32-LABEL: merge_load_store:
+; LA32: # %bb.0: # %start
+; LA32-NEXT: ld.b $a2, $a0, 0
+; LA32-NEXT: ld.b $a3, $a0, 1
+; LA32-NEXT: ld.b $a4, $a0, 2
+; LA32-NEXT: ld.b $a5, $a0, 3
+; LA32-NEXT: st.b $a2, $a1, 0
+; LA32-NEXT: st.b $a3, $a1, 1
+; LA32-NEXT: st.b $a4, $a1, 2
+; LA32-NEXT: st.b $a5, $a1, 3
+; LA32-NEXT: ld.b $a2, $a0, 4
+; LA32-NEXT: ld.b $a3, $a0, 5
+; LA32-NEXT: ld.b $a4, $a0, 6
+; LA32-NEXT: ld.b $a0, $a0, 7
+; LA32-NEXT: st.b $a2, $a1, 4
+; LA32-NEXT: st.b $a3, $a1, 5
+; LA32-NEXT: st.b $a4, $a1, 6
+; LA32-NEXT: st.b $a0, $a1, 7
+; LA32-NEXT: ret
+;
+; LA64-LABEL: merge_load_store:
+; LA64: # %bb.0: # %start
+; LA64-NEXT: ld.b $a2, $a0, 0
+; LA64-NEXT: ld.b $a3, $a0, 1
+; LA64-NEXT: ld.b $a4, $a0, 2
+; LA64-NEXT: ld.b $a5, $a0, 3
+; LA64-NEXT: st.b $a2, $a1, 0
+; LA64-NEXT: st.b $a3, $a1, 1
+; LA64-NEXT: st.b $a4, $a1, 2
+; LA64-NEXT: st.b $a5, $a1, 3
+; LA64-NEXT: ld.b $a2, $a0, 4
+; LA64-NEXT: ld.b $a3, $a0, 5
+; LA64-NEXT: ld.b $a4, $a0, 6
+; LA64-NEXT: ld.b $a0, $a0, 7
+; LA64-NEXT: st.b $a2, $a1, 4
+; LA64-NEXT: st.b $a3, $a1, 5
+; LA64-NEXT: st.b $a4, $a1, 6
+; LA64-NEXT: st.b $a0, $a1, 7
+; LA64-NEXT: ret
+start:
+ %_3 = load i8, ptr %src, align 1
+ store i8 %_3, ptr %dst, align 1
+ %0 = getelementptr inbounds i8, ptr %src, i64 1
+ %_4 = load i8, ptr %0, align 1
+ %1 = getelementptr inbounds i8, ptr %dst, i64 1
+ store i8 %_4, ptr %1, align 1
+ %2 = getelementptr inbounds i8, ptr %src, i64 2
+ %_5 = load i8, ptr %2, align 1
+ %3 = getelementptr inbounds i8, ptr %dst, i64 2
+ store i8 %_5, ptr %3, align 1
+ %4 = getelementptr inbounds i8, ptr %src, i64 3
+ %_6 = load i8, ptr %4, align 1
+ %5 = getelementptr inbounds i8, ptr %dst, i64 3
+ store i8 %_6, ptr %5, align 1
+ %6 = getelementptr inbounds i8, ptr %src, i64 4
+ %_7 = load i8, ptr %6, align 1
+ %7 = getelementptr inbounds i8, ptr %dst, i64 4
+ store i8 %_7, ptr %7, align 1
+ %8 = getelementptr inbounds i8, ptr %src, i64 5
+ %_8 = load i8, ptr %8, align 1
+ %9 = getelementptr inbounds i8, ptr %dst, i64 5
+ store i8 %_8, ptr %9, align 1
+ %10 = getelementptr inbounds i8, ptr %src, i64 6
+ %_9 = load i8, ptr %10, align 1
+ %11 = getelementptr inbounds i8, ptr %dst, i64 6
+ store i8 %_9, ptr %11, align 1
+ %12 = getelementptr inbounds i8, ptr %src, i64 7
+ %_10 = load i8, ptr %12, align 1
+ %13 = getelementptr inbounds i8, ptr %dst, i64 7
+ store i8 %_10, ptr %13, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-invalid.mir b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-invalid.mir
new file mode 100644
index 0000000..733b352
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-invalid.mir
@@ -0,0 +1,12 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=ERR
+
+---
+name: invalid_reg_spill_phys_vgprs
+machineFunctionInfo:
+# ERR: [[@LINE+1]]:21: unknown register name 'notareg'
+ spillPhysVGPRs: ['$notareg']
+body: |
+ bb.0:
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-not-a-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-not-a-reg.mir
new file mode 100644
index 0000000..7275d3c
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs-not-a-reg.mir
@@ -0,0 +1,12 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=ERR
+
+---
+name: invalid_reg_spill_phys_vgprs
+machineFunctionInfo:
+# ERR: [[@LINE+1]]:20: expected a named register
+ spillPhysVGPRs: [123]
+body: |
+ bb.0:
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir
new file mode 100644
index 0000000..4d6e33c
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/spill-phys-vgprs.mir
@@ -0,0 +1,63 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck %s
+
+# CHECK: csr_sgpr_spill
+# CHECK: spillPhysVGPRs
+# CHECK-NEXT: - '$vgpr0'
+---
+name: csr_sgpr_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ hasSpilledSGPRs: true
+body: |
+ bb.0:
+ S_NOP 0
+ bb.1:
+ $sgpr40 = S_MOV_B32 0
+ $sgpr41 = S_MOV_B32 1
+
+...
+
+# CHECK-LABEL: name: parse_none
+# CHECK: machineFunctionInfo:
+# CHECK-NOT: spillPhysVGPRs
+---
+name: parse_none
+machineFunctionInfo:
+ spillPhysVGPRs: []
+body: |
+ bb.0:
+ S_ENDPGM 0
+
+...
+
+# CHECK-LABEL: name: parse_one
+# CHECK: machineFunctionInfo:
+# CHECK: spillPhysVGPRs
+# CHECK-NEXT: - '$vgpr0'
+---
+name: parse_one
+machineFunctionInfo:
+ spillPhysVGPRs: ['$vgpr0']
+body: |
+ bb.0:
+ S_ENDPGM 0
+
+...
+
+# CHECK-LABEL: name: parse_two
+# CHECK: machineFunctionInfo:
+# CHECK: spillPhysVGPRs
+# CHECK-NEXT: - '$vgpr0'
+# CHECK-NEXT: - '$vgpr1'
+---
+name: parse_two
+machineFunctionInfo:
+ spillPhysVGPRs: ['$vgpr0', '$vgpr1']
+body: |
+ bb.0:
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/Mips/fp16-promote.ll b/llvm/test/CodeGen/Mips/fp16-promote.ll
index 348cf38..c104ffb 100644
--- a/llvm/test/CodeGen/Mips/fp16-promote.ll
+++ b/llvm/test/CodeGen/Mips/fp16-promote.ll
@@ -4,27 +4,25 @@
define void @test_fadd(ptr %p, ptr %q) nounwind {
; CHECK-LIBCALL-LABEL: test_fadd:
; CHECK-LIBCALL: # %bb.0:
-; CHECK-LIBCALL-NEXT: addiu $sp, $sp, -40
-; CHECK-LIBCALL-NEXT: sdc1 $f20, 32($sp) # 8-byte Folded Spill
-; CHECK-LIBCALL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LIBCALL-NEXT: move $17, $4
-; CHECK-LIBCALL-NEXT: lhu $4, 0($4)
+; CHECK-LIBCALL-NEXT: addiu $sp, $sp, -32
+; CHECK-LIBCALL-NEXT: sdc1 $f20, 24($sp) # 8-byte Folded Spill
+; CHECK-LIBCALL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; CHECK-LIBCALL-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; CHECK-LIBCALL-NEXT: move $16, $4
+; CHECK-LIBCALL-NEXT: lhu $4, 0($5)
; CHECK-LIBCALL-NEXT: jal __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: move $16, $5
+; CHECK-LIBCALL-NEXT: nop
; CHECK-LIBCALL-NEXT: lhu $4, 0($16)
; CHECK-LIBCALL-NEXT: jal __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: mov.s $f20, $f0
; CHECK-LIBCALL-NEXT: jal __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: add.s $f12, $f20, $f0
-; CHECK-LIBCALL-NEXT: sh $2, 0($17)
-; CHECK-LIBCALL-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; CHECK-LIBCALL-NEXT: ldc1 $f20, 32($sp) # 8-byte Folded Reload
+; CHECK-LIBCALL-NEXT: add.s $f12, $f0, $f20
+; CHECK-LIBCALL-NEXT: sh $2, 0($16)
+; CHECK-LIBCALL-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: ldc1 $f20, 24($sp) # 8-byte Folded Reload
; CHECK-LIBCALL-NEXT: jr $ra
-; CHECK-LIBCALL-NEXT: addiu $sp, $sp, 40
+; CHECK-LIBCALL-NEXT: addiu $sp, $sp, 32
%a = load half, ptr %p, align 2
%b = load half, ptr %q, align 2
%r = fadd half %a, %b
@@ -260,3 +258,55 @@ define void @test_vec_fptrunc_double(<4 x double> %a, ptr %p) nounwind {
ret void
}

+define half @test_fadd_fadd(half %a, half %b, half %c) nounwind {
+; CHECK-LIBCALL-LABEL: test_fadd_fadd:
+; CHECK-LIBCALL: # %bb.0:
+; CHECK-LIBCALL-NEXT: addiu $sp, $sp, -40
+; CHECK-LIBCALL-NEXT: sdc1 $f20, 32($sp) # 8-byte Folded Spill
+; CHECK-LIBCALL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; CHECK-LIBCALL-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
+; CHECK-LIBCALL-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
+; CHECK-LIBCALL-NEXT: move $16, $6
+; CHECK-LIBCALL-NEXT: move $17, $4
+; CHECK-LIBCALL-NEXT: jal __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: move $4, $5
+; CHECK-LIBCALL-NEXT: mov.s $f20, $f0
+; CHECK-LIBCALL-NEXT: jal __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: move $4, $17
+; CHECK-LIBCALL-NEXT: jal __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: add.s $f12, $f0, $f20
+; CHECK-LIBCALL-NEXT: jal __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: move $4, $2
+; CHECK-LIBCALL-NEXT: mov.s $f20, $f0
+; CHECK-LIBCALL-NEXT: jal __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: move $4, $16
+; CHECK-LIBCALL-NEXT: jal __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: add.s $f12, $f20, $f0
+; CHECK-LIBCALL-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: ldc1 $f20, 32($sp) # 8-byte Folded Reload
+; CHECK-LIBCALL-NEXT: jr $ra
+; CHECK-LIBCALL-NEXT: addiu $sp, $sp, 40
+ %d = fadd half %a, %b
+ %e = fadd half %d, %c
+ ret half %e
+}
+
+define half @to_half(i16 %bits) nounwind {
+; CHECK-LIBCALL-LABEL: to_half:
+; CHECK-LIBCALL: # %bb.0:
+; CHECK-LIBCALL-NEXT: jr $ra
+; CHECK-LIBCALL-NEXT: move $2, $4
+ %f = bitcast i16 %bits to half
+ ret half %f
+}
+
+define i16 @from_half(half %f) nounwind {
+; CHECK-LIBCALL-LABEL: from_half:
+; CHECK-LIBCALL: # %bb.0:
+; CHECK-LIBCALL-NEXT: jr $ra
+; CHECK-LIBCALL-NEXT: move $2, $4
+ %bits = bitcast half %f to i16
+ ret i16 %bits
+}
diff --git a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
index 37cca86..f4c78fb 100644
--- a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
@@ -47,17 +47,16 @@ define i1 @test_srem_even(i4 %X) nounwind {
; MIPSEL-NEXT: sra $1, $1, 28
; MIPSEL-NEXT: sll $2, $1, 1
; MIPSEL-NEXT: addu $1, $2, $1
-; MIPSEL-NEXT: srl $2, $1, 4
-; MIPSEL-NEXT: srl $1, $1, 7
-; MIPSEL-NEXT: andi $1, $1, 1
-; MIPSEL-NEXT: addiu $3, $zero, 1
-; MIPSEL-NEXT: addu $1, $2, $1
-; MIPSEL-NEXT: sll $2, $1, 1
-; MIPSEL-NEXT: sll $1, $1, 2
+; MIPSEL-NEXT: srl $2, $1, 31
+; MIPSEL-NEXT: srl $1, $1, 4
; MIPSEL-NEXT: addu $1, $1, $2
+; MIPSEL-NEXT: addiu $2, $zero, 1
+; MIPSEL-NEXT: sll $3, $1, 1
+; MIPSEL-NEXT: sll $1, $1, 2
+; MIPSEL-NEXT: addu $1, $1, $3
; MIPSEL-NEXT: subu $1, $4, $1
; MIPSEL-NEXT: andi $1, $1, 15
-; MIPSEL-NEXT: xor $1, $1, $3
+; MIPSEL-NEXT: xor $1, $1, $2
; MIPSEL-NEXT: jr $ra
; MIPSEL-NEXT: sltiu $2, $1, 1
;
@@ -69,10 +68,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; MIPS64EL-NEXT: sll $3, $2, 1
; MIPS64EL-NEXT: addu $2, $3, $2
; MIPS64EL-NEXT: addiu $3, $zero, 1
-; MIPS64EL-NEXT: srl $4, $2, 4
-; MIPS64EL-NEXT: srl $2, $2, 7
-; MIPS64EL-NEXT: andi $2, $2, 1
-; MIPS64EL-NEXT: addu $2, $4, $2
+; MIPS64EL-NEXT: srl $4, $2, 31
+; MIPS64EL-NEXT: srl $2, $2, 4
+; MIPS64EL-NEXT: addu $2, $2, $4
; MIPS64EL-NEXT: sll $4, $2, 1
; MIPS64EL-NEXT: sll $2, $2, 2
; MIPS64EL-NEXT: addu $2, $2, $4
diff --git a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
index 35b4780..ae23520 100644
--- a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll
@@ -11,23 +11,18 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr {
; 32BIT-NEXT: stwu 1, -48(1)
; 32BIT-NEXT: .cfi_def_cfa_offset 48
; 32BIT-NEXT: lxvw4x 34, 0, 3
-; 32BIT-NEXT: li 3, .LCPI0_0@l
-; 32BIT-NEXT: lis 4, .LCPI0_0@ha
; 32BIT-NEXT: li 5, 0
-; 32BIT-NEXT: xxlxor 36, 36, 36
-; 32BIT-NEXT: lxvw4x 35, 4, 3
; 32BIT-NEXT: addi 3, 1, 16
; 32BIT-NEXT: addi 4, 1, 32
-; 32BIT-NEXT: .p2align 4
+; 32BIT-NEXT: xxspltw 35, 34, 1
+; 32BIT-NEXT: .p2align 5
; 32BIT-NEXT: .LBB0_1: # %while.body
; 32BIT-NEXT: #
; 32BIT-NEXT: stw 5, 16(1)
-; 32BIT-NEXT: lxvw4x 37, 0, 3
-; 32BIT-NEXT: vperm 5, 5, 4, 3
-; 32BIT-NEXT: vadduwm 5, 2, 5
-; 32BIT-NEXT: xxspltw 32, 37, 1
-; 32BIT-NEXT: vadduwm 5, 5, 0
-; 32BIT-NEXT: stxvw4x 37, 0, 4
+; 32BIT-NEXT: lxvw4x 36, 0, 3
+; 32BIT-NEXT: vadduwm 4, 2, 4
+; 32BIT-NEXT: vadduwm 4, 4, 3
+; 32BIT-NEXT: stxvw4x 36, 0, 4
; 32BIT-NEXT: lwz 5, 32(1)
; 32BIT-NEXT: b .LBB0_1
;
@@ -35,21 +30,16 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr {
; 64BIT: # %bb.0: # %entry
; 64BIT-NEXT: lxvw4x 34, 0, 3
; 64BIT-NEXT: li 3, 0
-; 64BIT-NEXT: rldimi 3, 3, 32, 0
-; 64BIT-NEXT: mtfprd 0, 3
-; 64BIT-NEXT: li 3, 0
-; 64BIT-NEXT: .p2align 4
+; 64BIT-NEXT: xxspltw 35, 34, 1
+; 64BIT-NEXT: .p2align 5
; 64BIT-NEXT: .LBB0_1: # %while.body
; 64BIT-NEXT: #
-; 64BIT-NEXT: li 4, 0
-; 64BIT-NEXT: rldimi 4, 3, 32, 0
-; 64BIT-NEXT: mtfprd 1, 4
-; 64BIT-NEXT: xxmrghd 35, 1, 0
-; 64BIT-NEXT: vadduwm 3, 2, 3
-; 64BIT-NEXT: xxspltw 36, 35, 1
-; 64BIT-NEXT: vadduwm 3, 3, 4
-; 64BIT-NEXT: xxsldwi 1, 35, 35, 3
-; 64BIT-NEXT: mffprwz 3, 1
+; 64BIT-NEXT: sldi 3, 3, 32
+; 64BIT-NEXT: mtvsrd 36, 3
+; 64BIT-NEXT: vadduwm 4, 2, 4
+; 64BIT-NEXT: vadduwm 4, 4, 3
+; 64BIT-NEXT: xxsldwi 0, 36, 36, 3
+; 64BIT-NEXT: mffprwz 3, 0
; 64BIT-NEXT: b .LBB0_1
entry:
br label %while.body
diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
index 2b07f27..18b07b2 100644
--- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
@@ -46,7 +46,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
; PPC-NEXT: slwi 4, 3, 28
; PPC-NEXT: srawi 4, 4, 28
; PPC-NEXT: mulli 4, 4, 3
-; PPC-NEXT: rlwinm 5, 4, 25, 31, 31
+; PPC-NEXT: srwi 5, 4, 31
; PPC-NEXT: srwi 4, 4, 4
; PPC-NEXT: add 4, 4, 5
; PPC-NEXT: mulli 4, 4, 6
@@ -65,7 +65,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
; PPC64LE-NEXT: srawi 4, 4, 28
; PPC64LE-NEXT: slwi 5, 4, 1
; PPC64LE-NEXT: add 4, 4, 5
-; PPC64LE-NEXT: rlwinm 5, 4, 25, 31, 31
+; PPC64LE-NEXT: srwi 5, 4, 31
; PPC64LE-NEXT: srwi 4, 4, 4
; PPC64LE-NEXT: add 4, 4, 5
; PPC64LE-NEXT: mulli 4, 4, 6
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 91ac7c5d..3d9fb91 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -488,10 +488,9 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV32IM-NEXT: srai a0, a0, 24
; RV32IM-NEXT: li a1, 86
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srli a1, a0, 8
-; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: srli a0, a0, 31
-; RV32IM-NEXT: add a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 31
+; RV32IM-NEXT: srli a0, a0, 8
+; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: sdiv8_constant_no_srai:
@@ -499,10 +498,9 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV32IMZB-NEXT: sext.b a0, a0
; RV32IMZB-NEXT: li a1, 86
; RV32IMZB-NEXT: mul a0, a0, a1
-; RV32IMZB-NEXT: srli a1, a0, 8
-; RV32IMZB-NEXT: slli a0, a0, 16
-; RV32IMZB-NEXT: srli a0, a0, 31
-; RV32IMZB-NEXT: add a0, a1, a0
+; RV32IMZB-NEXT: srli a1, a0, 31
+; RV32IMZB-NEXT: srli a0, a0, 8
+; RV32IMZB-NEXT: add a0, a0, a1
; RV32IMZB-NEXT: ret
;
; RV64IM-LABEL: sdiv8_constant_no_srai:
@@ -511,10 +509,9 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a0, a0, 56
; RV64IM-NEXT: li a1, 86
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srli a1, a0, 8
-; RV64IM-NEXT: slli a0, a0, 48
-; RV64IM-NEXT: srli a0, a0, 63
-; RV64IM-NEXT: add a0, a1, a0
+; RV64IM-NEXT: srli a1, a0, 63
+; RV64IM-NEXT: srli a0, a0, 8
+; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: sdiv8_constant_no_srai:
@@ -522,10 +519,9 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: sext.b a0, a0
; RV64IMZB-NEXT: li a1, 86
; RV64IMZB-NEXT: mul a0, a0, a1
-; RV64IMZB-NEXT: srli a1, a0, 8
-; RV64IMZB-NEXT: slli a0, a0, 48
-; RV64IMZB-NEXT: srli a0, a0, 63
-; RV64IMZB-NEXT: add a0, a1, a0
+; RV64IMZB-NEXT: srli a1, a0, 63
+; RV64IMZB-NEXT: srli a0, a0, 8
+; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: ret
%1 = sdiv i8 %a, 3
ret i8 %1
@@ -538,10 +534,9 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV32IM-NEXT: srai a0, a0, 24
; RV32IM-NEXT: li a1, 103
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srai a1, a0, 9
-; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: srli a0, a0, 31
-; RV32IM-NEXT: add a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 31
+; RV32IM-NEXT: srai a0, a0, 9
+; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: sdiv8_constant_srai:
@@ -549,10 +544,9 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV32IMZB-NEXT: sext.b a0, a0
; RV32IMZB-NEXT: li a1, 103
; RV32IMZB-NEXT: mul a0, a0, a1
-; RV32IMZB-NEXT: srai a1, a0, 9
-; RV32IMZB-NEXT: slli a0, a0, 16
-; RV32IMZB-NEXT: srli a0, a0, 31
-; RV32IMZB-NEXT: add a0, a1, a0
+; RV32IMZB-NEXT: srli a1, a0, 31
+; RV32IMZB-NEXT: srai a0, a0, 9
+; RV32IMZB-NEXT: add a0, a0, a1
; RV32IMZB-NEXT: ret
;
; RV64IM-LABEL: sdiv8_constant_srai:
@@ -561,10 +555,9 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a0, a0, 56
; RV64IM-NEXT: li a1, 103
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srai a1, a0, 9
-; RV64IM-NEXT: slli a0, a0, 48
-; RV64IM-NEXT: srli a0, a0, 63
-; RV64IM-NEXT: add a0, a1, a0
+; RV64IM-NEXT: srli a1, a0, 63
+; RV64IM-NEXT: srai a0, a0, 9
+; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: sdiv8_constant_srai:
@@ -572,10 +565,9 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: sext.b a0, a0
; RV64IMZB-NEXT: li a1, 103
; RV64IMZB-NEXT: mul a0, a0, a1
-; RV64IMZB-NEXT: srai a1, a0, 9
-; RV64IMZB-NEXT: slli a0, a0, 48
-; RV64IMZB-NEXT: srli a0, a0, 63
-; RV64IMZB-NEXT: add a0, a1, a0
+; RV64IMZB-NEXT: srli a1, a0, 63
+; RV64IMZB-NEXT: srai a0, a0, 9
+; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: ret
%1 = sdiv i8 %a, 5
ret i8 %1
@@ -728,7 +720,7 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
; RV64IM-NEXT: lui a1, 5
; RV64IM-NEXT: addiw a1, a1, 1366
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srliw a1, a0, 31
+; RV64IM-NEXT: srli a1, a0, 63
; RV64IM-NEXT: srli a0, a0, 16
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
@@ -739,7 +731,7 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
; RV64IMZB-NEXT: lui a1, 5
; RV64IMZB-NEXT: addiw a1, a1, 1366
; RV64IMZB-NEXT: mul a0, a0, a1
-; RV64IMZB-NEXT: srliw a1, a0, 31
+; RV64IMZB-NEXT: srli a1, a0, 63
; RV64IMZB-NEXT: srli a0, a0, 16
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: ret
@@ -778,7 +770,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
; RV64IM-NEXT: lui a1, 6
; RV64IM-NEXT: addiw a1, a1, 1639
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srliw a1, a0, 31
+; RV64IM-NEXT: srli a1, a0, 63
; RV64IM-NEXT: srai a0, a0, 17
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
@@ -789,7 +781,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
; RV64IMZB-NEXT: lui a1, 6
; RV64IMZB-NEXT: addiw a1, a1, 1639
; RV64IMZB-NEXT: mul a0, a0, a1
-; RV64IMZB-NEXT: srliw a1, a0, 31
+; RV64IMZB-NEXT: srli a1, a0, 63
; RV64IMZB-NEXT: srai a0, a0, 17
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index f4e6769..e94efbe 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -980,10 +980,9 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
; RV32IM-NEXT: srai a0, a0, 24
; RV32IM-NEXT: li a1, 103
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srai a1, a0, 9
-; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: srli a0, a0, 31
-; RV32IM-NEXT: add a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 31
+; RV32IM-NEXT: srai a0, a0, 9
+; RV32IM-NEXT: add a0, a0, a1
; RV32IM-NEXT: ret
;
; RV64I-LABEL: sdiv8_constant:
@@ -1004,10 +1003,9 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
; RV64IM-NEXT: srai a0, a0, 56
; RV64IM-NEXT: li a1, 103
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srai a1, a0, 9
-; RV64IM-NEXT: slli a0, a0, 48
-; RV64IM-NEXT: srli a0, a0, 63
-; RV64IM-NEXT: add a0, a1, a0
+; RV64IM-NEXT: srli a1, a0, 63
+; RV64IM-NEXT: srai a0, a0, 9
+; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
%1 = sdiv i8 %a, 5
ret i8 %1
@@ -1193,7 +1191,7 @@ define i16 @sdiv16_constant(i16 %a) nounwind {
; RV64IM-NEXT: lui a1, 6
; RV64IM-NEXT: addiw a1, a1, 1639
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srliw a1, a0, 31
+; RV64IM-NEXT: srli a1, a0, 63
; RV64IM-NEXT: srai a0, a0, 17
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
index 9ad1d71..de459da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
@@ -1,12 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh -verify-machineinstrs \
-; RUN: -target-abi=ilp32d < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh -verify-machineinstrs \
-; RUN: -target-abi=lp64d < %s | FileCheck %s --check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv32 -mattr=+zve32f,+zvl128b,+d,+zvfh \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs \
+; RUN: -target-abi=ilp32d < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs \
+; RUN: -target-abi=lp64d < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs \
+; RUN: -target-abi=ilp32d < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs \
+; RUN: -target-abi=lp64d < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+zve32f,+zvl128b,+d,+zvfh,+zfbfmin,+zvfbfmin \
; RUN: -verify-machineinstrs -target-abi=ilp32d < %s | FileCheck %s \
; RUN: --check-prefixes=ELEN32,RV32ELEN32
-; RUN: llc -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+d,+zvfh \
+; RUN: llc -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+d,+zvfh,+zfbfmin,+zvfbfmin \
; RUN: -verify-machineinstrs -target-abi=lp64d < %s | FileCheck %s \
; RUN: --check-prefixes=ELEN32,RV64ELEN32
@@ -262,13 +266,92 @@ define i64 @bitcast_v1i64_i64(<1 x i64> %a) {
ret i64 %b
}

-define half @bitcast_v2i8_f16(<2 x i8> %a) {
-; CHECK-LABEL: bitcast_v2i8_f16:
+define bfloat @bitcast_v2i8_bf16(<2 x i8> %a) {
+; CHECK-LABEL: bitcast_v2i8_bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+;
+; ELEN32-LABEL: bitcast_v2i8_bf16:
+; ELEN32: # %bb.0:
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ELEN32-NEXT: vmv.x.s a0, v8
+; ELEN32-NEXT: fmv.h.x fa0, a0
+; ELEN32-NEXT: ret
+ %b = bitcast <2 x i8> %a to bfloat
+ ret bfloat %b
+}
+
+define bfloat @bitcast_v1i16_bf16(<1 x i16> %a) {
+; CHECK-LABEL: bitcast_v1i16_bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+;
+; ELEN32-LABEL: bitcast_v1i16_bf16:
+; ELEN32: # %bb.0:
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ELEN32-NEXT: vmv.x.s a0, v8
+; ELEN32-NEXT: fmv.h.x fa0, a0
+; ELEN32-NEXT: ret
+ %b = bitcast <1 x i16> %a to bfloat
+ ret bfloat %b
+}
+
+define bfloat @bitcast_v1bf16_bf16(<1 x bfloat> %a) {
+; CHECK-LABEL: bitcast_v1bf16_bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
+; CHECK-NEXT: ret
+;
+; ELEN32-LABEL: bitcast_v1bf16_bf16:
+; ELEN32: # %bb.0:
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ELEN32-NEXT: vmv.x.s a0, v8
+; ELEN32-NEXT: fmv.h.x fa0, a0
+; ELEN32-NEXT: ret
+ %b = bitcast <1 x bfloat> %a to bfloat
+ ret bfloat %b
+}
+
+define <1 x bfloat> @bitcast_bf16_v1bf16(bfloat %a) {
+; CHECK-LABEL: bitcast_bf16_v1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
;
+; ELEN32-LABEL: bitcast_bf16_v1bf16:
+; ELEN32: # %bb.0:
+; ELEN32-NEXT: fmv.x.h a0, fa0
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ELEN32-NEXT: vmv.s.x v8, a0
+; ELEN32-NEXT: ret
+ %b = bitcast bfloat %a to <1 x bfloat>
+ ret <1 x bfloat> %b
+}
+
+define half @bitcast_v2i8_f16(<2 x i8> %a) {
+; ZVFH-LABEL: bitcast_v2i8_f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: bitcast_v2i8_f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
+;
; ELEN32-LABEL: bitcast_v2i8_f16:
; ELEN32: # %bb.0:
; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
@@ -279,11 +362,18 @@ define half @bitcast_v2i8_f16(<2 x i8> %a) {
}

define half @bitcast_v1i16_f16(<1 x i16> %a) {
-; CHECK-LABEL: bitcast_v1i16_f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: ret
+; ZVFH-LABEL: bitcast_v1i16_f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: bitcast_v1i16_f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
;
; ELEN32-LABEL: bitcast_v1i16_f16:
; ELEN32: # %bb.0:
@@ -294,6 +384,52 @@ define half @bitcast_v1i16_f16(<1 x i16> %a) {
ret half %b
}

+define half @bitcast_v1f16_f16(<1 x half> %a) {
+; ZVFH-LABEL: bitcast_v1f16_f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: bitcast_v1f16_f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
+;
+; ELEN32-LABEL: bitcast_v1f16_f16:
+; ELEN32: # %bb.0:
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ELEN32-NEXT: vfmv.f.s fa0, v8
+; ELEN32-NEXT: ret
+ %b = bitcast <1 x half> %a to half
+ ret half %b
+}
+
+define <1 x half> @bitcast_f16_v1f16(half %a) {
+; ZVFH-LABEL: bitcast_f16_v1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFH-NEXT: vfmv.s.f v8, fa0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: bitcast_f16_v1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.s.x v8, a0
+; ZVFHMIN-NEXT: ret
+;
+; ELEN32-LABEL: bitcast_f16_v1f16:
+; ELEN32: # %bb.0:
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ELEN32-NEXT: vfmv.s.f v8, fa0
+; ELEN32-NEXT: ret
+ %b = bitcast half %a to <1 x half>
+ ret <1 x half> %b
+}
+
define float @bitcast_v4i8_f32(<4 x i8> %a) {
; CHECK-LABEL: bitcast_v4i8_f32:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index f2052cc..cb830d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -1,8 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32NOM
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32M
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64NOM
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64M
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32,RV32NOM
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zfbfmin,+zvfbfmin,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32,RV32M
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64,RV64NOM
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zfbfmin,+zvfbfmin,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64,RV64M
+
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32,RV32M
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64,RV64M
define i8 @extractelt_v16i8(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v16i8:
@@ -66,14 +69,37 @@ define i64 @extractelt_v2i64(ptr %x) nounwind {
ret i64 %b
}
-define half @extractelt_v8f16(ptr %x) nounwind {
-; CHECK-LABEL: extractelt_v8f16:
+define bfloat @extractelt_v8bf16(ptr %x) nounwind {
+; CHECK-LABEL: extractelt_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
; CHECK-NEXT: ret
+ %a = load <8 x bfloat>, ptr %x
+ %b = extractelement <8 x bfloat> %a, i32 7
+ ret bfloat %b
+}
+
+define half @extractelt_v8f16(ptr %x) nounwind {
+; ZVFH-LABEL: extractelt_v8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a0)
+; ZVFH-NEXT: vslidedown.vi v8, v8, 7
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: extractelt_v8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a0)
+; ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
%a = load <8 x half>, ptr %x
%b = extractelement <8 x half> %a, i32 7
ret half %b
@@ -171,15 +197,40 @@ define i64 @extractelt_v4i64(ptr %x) nounwind {
ret i64 %b
}
-define half @extractelt_v16f16(ptr %x) nounwind {
-; CHECK-LABEL: extractelt_v16f16:
+define bfloat @extractelt_v16bf16(ptr %x) nounwind {
+; CHECK-LABEL: extractelt_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
; CHECK-NEXT: ret
+ %a = load <16 x bfloat>, ptr %x
+ %b = extractelement <16 x bfloat> %a, i32 7
+ ret bfloat %b
+}
+
+define half @extractelt_v16f16(ptr %x) nounwind {
+; ZVFH-LABEL: extractelt_v16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a0)
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFH-NEXT: vslidedown.vi v8, v8, 7
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: extractelt_v16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a0)
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vi v8, v8, 7
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
%a = load <16 x half>, ptr %x
%b = extractelement <16 x half> %a, i32 7
ret half %b
@@ -398,15 +449,49 @@ define i64 @extractelt_v2i64_idx(ptr %x, i32 zeroext %idx) nounwind {
ret i64 %c
}
-define half @extractelt_v8f16_idx(ptr %x, i32 zeroext %idx) nounwind {
-; CHECK-LABEL: extractelt_v8f16_idx:
+define bfloat @extractelt_v8bf16_idx(ptr %x, i32 zeroext %idx) nounwind {
+; CHECK-LABEL: extractelt_v8bf16_idx:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vfadd.vv v8, v8, v8
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v10, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8
+; CHECK-NEXT: vslidedown.vx v8, v10, a1
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
; CHECK-NEXT: ret
+ %a = load <8 x bfloat>, ptr %x
+ %b = fadd <8 x bfloat> %a, %a
+ %c = extractelement <8 x bfloat> %b, i32 %idx
+ ret bfloat %c
+}
+
+define half @extractelt_v8f16_idx(ptr %x, i32 zeroext %idx) nounwind {
+; ZVFH-LABEL: extractelt_v8f16_idx:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a0)
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vslidedown.vx v8, v8, a1
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: extractelt_v8f16_idx:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a0)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v8, v10, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8
+; ZVFHMIN-NEXT: vslidedown.vx v8, v10, a1
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
%a = load <8 x half>, ptr %x
%b = fadd <8 x half> %a, %a
%c = extractelement <8 x half> %b, i32 %idx
@@ -513,15 +598,49 @@ define i64 @extractelt_v4i64_idx(ptr %x, i32 zeroext %idx) nounwind {
ret i64 %c
}
-define half @extractelt_v16f16_idx(ptr %x, i32 zeroext %idx) nounwind {
-; CHECK-LABEL: extractelt_v16f16_idx:
+define bfloat @extractelt_v16bf16_idx(ptr %x, i32 zeroext %idx) nounwind {
+; CHECK-LABEL: extractelt_v16bf16_idx:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vfadd.vv v8, v8, v8
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfadd.vv v8, v12, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8
+; CHECK-NEXT: vslidedown.vx v8, v12, a1
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: fmv.h.x fa0, a0
; CHECK-NEXT: ret
+ %a = load <16 x bfloat>, ptr %x
+ %b = fadd <16 x bfloat> %a, %a
+ %c = extractelement <16 x bfloat> %b, i32 %idx
+ ret bfloat %c
+}
+
+define half @extractelt_v16f16_idx(ptr %x, i32 zeroext %idx) nounwind {
+; ZVFH-LABEL: extractelt_v16f16_idx:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a0)
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vslidedown.vx v8, v8, a1
+; ZVFH-NEXT: vfmv.f.s fa0, v8
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: extractelt_v16f16_idx:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a0)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v8, v12, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v8
+; ZVFHMIN-NEXT: vslidedown.vx v8, v12, a1
+; ZVFHMIN-NEXT: vmv.x.s a0, v8
+; ZVFHMIN-NEXT: fmv.h.x fa0, a0
+; ZVFHMIN-NEXT: ret
%a = load <16 x half>, ptr %x
%b = fadd <16 x half> %a, %a
%c = extractelement <16 x half> %b, i32 %idx
@@ -939,8 +1058,8 @@ define i32 @extractelt_mul_v4i32(<4 x i32> %x) {
define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
; RV32NOM-LABEL: extractelt_sdiv_v4i32:
; RV32NOM: # %bb.0:
-; RV32NOM-NEXT: lui a0, %hi(.LCPI42_0)
-; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI42_0)
+; RV32NOM-NEXT: lui a0, %hi(.LCPI46_0)
+; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI46_0)
; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32NOM-NEXT: vle32.v v9, (a0)
; RV32NOM-NEXT: vmulh.vv v9, v8, v9
@@ -975,8 +1094,8 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
;
; RV64NOM-LABEL: extractelt_sdiv_v4i32:
; RV64NOM: # %bb.0:
-; RV64NOM-NEXT: lui a0, %hi(.LCPI42_0)
-; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI42_0)
+; RV64NOM-NEXT: lui a0, %hi(.LCPI46_0)
+; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI46_0)
; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64NOM-NEXT: vle32.v v9, (a0)
; RV64NOM-NEXT: vmulh.vv v9, v8, v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
index bdedc5f..3f7cd91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
@@ -21,58 +21,18 @@ define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) {
;
; RV32-ZFBFMIN-LABEL: splat_idx_v4bf16:
; RV32-ZFBFMIN: # %bb.0:
-; RV32-ZFBFMIN-NEXT: addi sp, sp, -48
-; RV32-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48
-; RV32-ZFBFMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-ZFBFMIN-NEXT: .cfi_offset ra, -4
-; RV32-ZFBFMIN-NEXT: csrr a1, vlenb
-; RV32-ZFBFMIN-NEXT: sub sp, sp, a1
-; RV32-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; RV32-ZFBFMIN-NEXT: addi a1, sp, 32
-; RV32-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-ZFBFMIN-NEXT: andi a0, a0, 3
-; RV32-ZFBFMIN-NEXT: li a1, 2
-; RV32-ZFBFMIN-NEXT: call __mulsi3
-; RV32-ZFBFMIN-NEXT: addi a1, sp, 16
-; RV32-ZFBFMIN-NEXT: add a0, a1, a0
-; RV32-ZFBFMIN-NEXT: addi a2, sp, 32
-; RV32-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
; RV32-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-ZFBFMIN-NEXT: vse16.v v8, (a1)
-; RV32-ZFBFMIN-NEXT: lh a0, 0(a0)
+; RV32-ZFBFMIN-NEXT: vslidedown.vx v8, v8, a0
+; RV32-ZFBFMIN-NEXT: vmv.x.s a0, v8
; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a0
-; RV32-ZFBFMIN-NEXT: csrr a0, vlenb
-; RV32-ZFBFMIN-NEXT: add sp, sp, a0
-; RV32-ZFBFMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-ZFBFMIN-NEXT: addi sp, sp, 48
; RV32-ZFBFMIN-NEXT: ret
;
; RV64-ZFBFMIN-LABEL: splat_idx_v4bf16:
; RV64-ZFBFMIN: # %bb.0:
-; RV64-ZFBFMIN-NEXT: addi sp, sp, -48
-; RV64-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48
-; RV64-ZFBFMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
-; RV64-ZFBFMIN-NEXT: .cfi_offset ra, -8
-; RV64-ZFBFMIN-NEXT: csrr a1, vlenb
-; RV64-ZFBFMIN-NEXT: sub sp, sp, a1
-; RV64-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; RV64-ZFBFMIN-NEXT: addi a1, sp, 32
-; RV64-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-ZFBFMIN-NEXT: andi a0, a0, 3
-; RV64-ZFBFMIN-NEXT: li a1, 2
-; RV64-ZFBFMIN-NEXT: call __muldi3
-; RV64-ZFBFMIN-NEXT: addi a1, sp, 16
-; RV64-ZFBFMIN-NEXT: add a0, a1, a0
-; RV64-ZFBFMIN-NEXT: addi a2, sp, 32
-; RV64-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
; RV64-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV64-ZFBFMIN-NEXT: vse16.v v8, (a1)
-; RV64-ZFBFMIN-NEXT: lh a0, 0(a0)
+; RV64-ZFBFMIN-NEXT: vslidedown.vx v8, v8, a0
+; RV64-ZFBFMIN-NEXT: vmv.x.s a0, v8
; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a0
-; RV64-ZFBFMIN-NEXT: csrr a0, vlenb
-; RV64-ZFBFMIN-NEXT: add sp, sp, a0
-; RV64-ZFBFMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
-; RV64-ZFBFMIN-NEXT: addi sp, sp, 48
; RV64-ZFBFMIN-NEXT: ret
%x = extractelement <4 x bfloat> %v, i64 %idx
%ins = insertelement <4 x bfloat> poison, bfloat %x, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 924732e..7e21983 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -201,58 +201,18 @@ define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) {
;
; RV32-ZFHMIN-LABEL: splat_idx_v4f16:
; RV32-ZFHMIN: # %bb.0:
-; RV32-ZFHMIN-NEXT: addi sp, sp, -48
-; RV32-ZFHMIN-NEXT: .cfi_def_cfa_offset 48
-; RV32-ZFHMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-ZFHMIN-NEXT: .cfi_offset ra, -4
-; RV32-ZFHMIN-NEXT: csrr a1, vlenb
-; RV32-ZFHMIN-NEXT: sub sp, sp, a1
-; RV32-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; RV32-ZFHMIN-NEXT: addi a1, sp, 32
-; RV32-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-ZFHMIN-NEXT: andi a0, a0, 3
-; RV32-ZFHMIN-NEXT: li a1, 2
-; RV32-ZFHMIN-NEXT: call __mulsi3
-; RV32-ZFHMIN-NEXT: addi a1, sp, 16
-; RV32-ZFHMIN-NEXT: add a0, a1, a0
-; RV32-ZFHMIN-NEXT: addi a2, sp, 32
-; RV32-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
; RV32-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-ZFHMIN-NEXT: vse16.v v8, (a1)
-; RV32-ZFHMIN-NEXT: lh a0, 0(a0)
+; RV32-ZFHMIN-NEXT: vslidedown.vx v8, v8, a0
+; RV32-ZFHMIN-NEXT: vmv.x.s a0, v8
; RV32-ZFHMIN-NEXT: vmv.v.x v8, a0
-; RV32-ZFHMIN-NEXT: csrr a0, vlenb
-; RV32-ZFHMIN-NEXT: add sp, sp, a0
-; RV32-ZFHMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-ZFHMIN-NEXT: addi sp, sp, 48
; RV32-ZFHMIN-NEXT: ret
;
; RV64-ZFHMIN-LABEL: splat_idx_v4f16:
; RV64-ZFHMIN: # %bb.0:
-; RV64-ZFHMIN-NEXT: addi sp, sp, -48
-; RV64-ZFHMIN-NEXT: .cfi_def_cfa_offset 48
-; RV64-ZFHMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
-; RV64-ZFHMIN-NEXT: .cfi_offset ra, -8
-; RV64-ZFHMIN-NEXT: csrr a1, vlenb
-; RV64-ZFHMIN-NEXT: sub sp, sp, a1
-; RV64-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; RV64-ZFHMIN-NEXT: addi a1, sp, 32
-; RV64-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-ZFHMIN-NEXT: andi a0, a0, 3
-; RV64-ZFHMIN-NEXT: li a1, 2
-; RV64-ZFHMIN-NEXT: call __muldi3
-; RV64-ZFHMIN-NEXT: addi a1, sp, 16
-; RV64-ZFHMIN-NEXT: add a0, a1, a0
-; RV64-ZFHMIN-NEXT: addi a2, sp, 32
-; RV64-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
; RV64-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV64-ZFHMIN-NEXT: vse16.v v8, (a1)
-; RV64-ZFHMIN-NEXT: lh a0, 0(a0)
+; RV64-ZFHMIN-NEXT: vslidedown.vx v8, v8, a0
+; RV64-ZFHMIN-NEXT: vmv.x.s a0, v8
; RV64-ZFHMIN-NEXT: vmv.v.x v8, a0
-; RV64-ZFHMIN-NEXT: csrr a0, vlenb
-; RV64-ZFHMIN-NEXT: add sp, sp, a0
-; RV64-ZFHMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
-; RV64-ZFHMIN-NEXT: addi sp, sp, 48
; RV64-ZFHMIN-NEXT: ret
%x = extractelement <4 x half> %v, i64 %idx
%ins = insertelement <4 x half> poison, half %x, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 87f9bfb..5524983 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVFH
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVFH
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVFHMIN,ZVFHMINRV32
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVFHMIN,ZVFHMINRV64
define <4 x i32> @insertelt_v4i32_0(<4 x i32> %a, i32 %y) {
; CHECK-LABEL: insertelt_v4i32_0:
@@ -673,3 +675,102 @@ define <8 x i64> @insertelt_c5_v8xi64_exact(<8 x i64> %vin, i64 %a) vscale_range
%v = insertelement <8 x i64> %vin, i64 %a, i32 5
ret <8 x i64> %v
}
+
+define <4 x bfloat> @insertelt_v4bf16_0(<4 x bfloat> %a, bfloat %y) {
+; CHECK-LABEL: insertelt_v4bf16_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: ret
+ %b = insertelement <4 x bfloat> %a, bfloat %y, i32 0
+ ret <4 x bfloat> %b
+}
+
+define <4 x bfloat> @insertelt_v4bf16_3(<4 x bfloat> %a, bfloat %y) {
+; CHECK-LABEL: insertelt_v4bf16_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vslideup.vi v8, v9, 3
+; CHECK-NEXT: ret
+ %b = insertelement <4 x bfloat> %a, bfloat %y, i32 3
+ ret <4 x bfloat> %b
+}
+
+define <4 x bfloat> @insertelt_v4bf16_idx(<4 x bfloat> %a, bfloat %y, i32 zeroext %idx) {
+; CHECK-LABEL: insertelt_v4bf16_idx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a1, a0, 1
+; CHECK-NEXT: fmv.x.h a2, fa0
+; CHECK-NEXT: vsetivli zero, 4, e16, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v9, a2
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %b = insertelement <4 x bfloat> %a, bfloat %y, i32 %idx
+ ret <4 x bfloat> %b
+}
+
+define <4 x half> @insertelt_v4f16_0(<4 x half> %a, half %y) {
+; ZVFH-LABEL: insertelt_v4f16_0:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma
+; ZVFH-NEXT: vfmv.s.f v8, fa0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: insertelt_v4f16_0:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma
+; ZVFHMIN-NEXT: vmv.s.x v8, a0
+; ZVFHMIN-NEXT: ret
+ %b = insertelement <4 x half> %a, half %y, i32 0
+ ret <4 x half> %b
+}
+
+define <4 x half> @insertelt_v4f16_3(<4 x half> %a, half %y) {
+; ZVFH-LABEL: insertelt_v4f16_3:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vfmv.s.f v9, fa0
+; ZVFH-NEXT: vslideup.vi v8, v9, 3
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: insertelt_v4f16_3:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vmv.s.x v9, a0
+; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3
+; ZVFHMIN-NEXT: ret
+ %b = insertelement <4 x half> %a, half %y, i32 3
+ ret <4 x half> %b
+}
+
+define <4 x half> @insertelt_v4f16_idx(<4 x half> %a, half %y, i32 zeroext %idx) {
+; ZVFH-LABEL: insertelt_v4f16_idx:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi a1, a0, 1
+; ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma
+; ZVFH-NEXT: vfmv.s.f v9, fa0
+; ZVFH-NEXT: vsetvli zero, a1, e16, mf2, tu, ma
+; ZVFH-NEXT: vslideup.vx v8, v9, a0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: insertelt_v4f16_idx:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi a1, a0, 1
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vmv.s.x v9, a2
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, mf2, tu, ma
+; ZVFHMIN-NEXT: vslideup.vx v8, v9, a0
+; ZVFHMIN-NEXT: ret
+ %b = insertelement <4 x half> %a, half %y, i32 %idx
+ ret <4 x half> %b
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ZVFHMINRV32: {{.*}}
+; ZVFHMINRV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index b3cc834..a5419c7c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -126,6 +126,56 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @load_
ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5
}
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @load_factor7(ptr %ptr) {
+; CHECK-LABEL: load_factor7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <14 x i16>, ptr %ptr
+ %v0 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 0, i32 7>
+ %v1 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 1, i32 8>
+ %v2 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 2, i32 9>
+ %v3 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 3, i32 10>
+ %v4 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 4, i32 11>
+ %v5 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 5, i32 12>
+ %v6 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> <i32 6, i32 13>
+ %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0
+ %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1
+ %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2
+ %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3
+ %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4
+ %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5
+ %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6
+ ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6
+}
+
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @load_factor8(ptr %ptr) {
+; CHECK-LABEL: load_factor8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <16 x i16>, ptr %ptr
+ %v0 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 0, i32 8>
+ %v1 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 1, i32 9>
+ %v2 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 2, i32 10>
+ %v3 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 3, i32 11>
+ %v4 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 4, i32 12>
+ %v5 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 5, i32 13>
+ %v6 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 6, i32 14>
+ %v7 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> <i32 7, i32 15>
+ %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0
+ %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1
+ %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2
+ %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3
+ %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4
+ %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5
+ %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6
+ %res7 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6, <2 x i16> %v7, 7
+ ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7
+}
+
; LMUL * NF is > 8 here and so shouldn't be lowered to a vlseg
define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) {
; RV32-LABEL: load_factor6_too_big:
@@ -174,12 +224,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; RV32-NEXT: vslideup.vi v4, v8, 10, v0.t
-; RV32-NEXT: lui a4, %hi(.LCPI6_0)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0)
+; RV32-NEXT: lui a4, %hi(.LCPI8_0)
+; RV32-NEXT: addi a4, a4, %lo(.LCPI8_0)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; RV32-NEXT: vle16.v v0, (a4)
-; RV32-NEXT: lui a4, %hi(.LCPI6_1)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1)
+; RV32-NEXT: lui a4, %hi(.LCPI8_1)
+; RV32-NEXT: addi a4, a4, %lo(.LCPI8_1)
; RV32-NEXT: lui a5, 1
; RV32-NEXT: vle16.v v8, (a4)
; RV32-NEXT: csrr a4, vlenb
@@ -260,10 +310,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI6_2)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2)
-; RV32-NEXT: lui a3, %hi(.LCPI6_3)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3)
+; RV32-NEXT: lui a1, %hi(.LCPI8_2)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2)
+; RV32-NEXT: lui a3, %hi(.LCPI8_3)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_3)
; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; RV32-NEXT: vle16.v v12, (a1)
; RV32-NEXT: vle16.v v8, (a3)
@@ -273,8 +323,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI6_4)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4)
+; RV32-NEXT: lui a1, %hi(.LCPI8_4)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v2, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -340,10 +390,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI6_5)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5)
-; RV32-NEXT: lui a3, %hi(.LCPI6_6)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6)
+; RV32-NEXT: lui a1, %hi(.LCPI8_5)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_5)
+; RV32-NEXT: lui a3, %hi(.LCPI8_6)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_6)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; RV32-NEXT: vle16.v v24, (a1)
; RV32-NEXT: vle16.v v4, (a3)
@@ -368,14 +418,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI6_7)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7)
-; RV32-NEXT: lui a3, %hi(.LCPI6_8)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8)
+; RV32-NEXT: lui a1, %hi(.LCPI8_7)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7)
+; RV32-NEXT: lui a3, %hi(.LCPI8_8)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_8)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v16, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI6_9)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9)
+; RV32-NEXT: lui a1, %hi(.LCPI8_9)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9)
; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; RV32-NEXT: vle16.v v8, (a3)
; RV32-NEXT: csrr a3, vlenb
@@ -440,8 +490,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI6_10)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10)
+; RV32-NEXT: lui a1, %hi(.LCPI8_10)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_10)
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; RV32-NEXT: vle16.v v12, (a1)
; RV32-NEXT: lui a1, 15
@@ -462,10 +512,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vmv4r.v v24, v16
-; RV32-NEXT: lui a1, %hi(.LCPI6_11)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11)
-; RV32-NEXT: lui a3, %hi(.LCPI6_12)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12)
+; RV32-NEXT: lui a1, %hi(.LCPI8_11)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11)
+; RV32-NEXT: lui a3, %hi(.LCPI8_12)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_12)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; RV32-NEXT: vle16.v v28, (a1)
; RV32-NEXT: vle16.v v4, (a3)
@@ -495,14 +545,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: lui a1, %hi(.LCPI6_13)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13)
-; RV32-NEXT: lui a3, %hi(.LCPI6_14)
-; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14)
+; RV32-NEXT: lui a1, %hi(.LCPI8_13)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_13)
+; RV32-NEXT: lui a3, %hi(.LCPI8_14)
+; RV32-NEXT: addi a3, a3, %lo(.LCPI8_14)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v8, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI6_15)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15)
+; RV32-NEXT: lui a1, %hi(.LCPI8_15)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI8_15)
; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; RV32-NEXT: vle16.v v28, (a3)
; RV32-NEXT: vle16.v v12, (a1)
@@ -1131,3 +1181,82 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2
store <12 x i16> %interleaved.vec, ptr %ptr
ret void
}
+
+
+define <4 x i32> @load_factor2_one_active(ptr %ptr) {
+; CHECK-LABEL: load_factor2_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <8 x i32>, ptr %ptr
+ %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %v0
+}
+
+
+define <4 x i32> @load_factor3_one_active(ptr %ptr) {
+; CHECK-LABEL: load_factor3_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <12 x i32>, ptr %ptr
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ ret <4 x i32> %v0
+}
+
+define <4 x i32> @load_factor4_one_active(ptr %ptr) {
+; CHECK-LABEL: load_factor4_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <16 x i32>, ptr %ptr
+ %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+ ret <4 x i32> %v0
+}
+
+define <4 x i32> @load_factor5_one_active(ptr %ptr) {
+; CHECK-LABEL: load_factor5_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <20 x i32>, ptr %ptr
+ %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+ ret <4 x i32> %v0
+}
+
+define <2 x i16> @load_factor6_one_active(ptr %ptr) {
+; CHECK-LABEL: load_factor6_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <12 x i16>, ptr %ptr
+ %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
+ ret <2 x i16> %v0
+}
+
+define <4 x i8> @load_factor7_one_active(ptr %ptr) vscale_range(8,1024) {
+; CHECK-LABEL: load_factor7_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <32 x i8>, ptr %ptr
+ %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
+ ret <4 x i8> %v0
+}
+
+define <4 x i8> @load_factor8_one_active(ptr %ptr) vscale_range(8,1024) {
+; CHECK-LABEL: load_factor8_one_active:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
+; CHECK-NEXT: ret
+ %interleaved.vec = load <32 x i8>, ptr %ptr
+ %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+ ret <4 x i8> %v0
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 5802f45..bd55690 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -13487,7 +13487,6 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
; RV32ZVE32F-NEXT: vid.v v8
; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: lw a3, 0(a1)
; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: lw a1, 4(a1)
@@ -13587,10 +13586,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
; RV32ZVE32F-NEXT: lw s9, 4(a1)
; RV32ZVE32F-NEXT: lw s10, 0(a2)
; RV32ZVE32F-NEXT: lw s11, 4(a2)
-; RV32ZVE32F-NEXT: lw t5, 0(a3)
-; RV32ZVE32F-NEXT: lw t6, 4(a3)
-; RV32ZVE32F-NEXT: lw s2, 0(a4)
-; RV32ZVE32F-NEXT: lw s3, 4(a4)
+; RV32ZVE32F-NEXT: lw s4, 0(a3)
+; RV32ZVE32F-NEXT: lw s5, 4(a3)
+; RV32ZVE32F-NEXT: lw s6, 0(a4)
+; RV32ZVE32F-NEXT: lw s7, 4(a4)
; RV32ZVE32F-NEXT: lw a2, 336(sp)
; RV32ZVE32F-NEXT: lw a4, 340(sp)
; RV32ZVE32F-NEXT: lw a5, 344(sp)
@@ -13607,8 +13606,8 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
; RV32ZVE32F-NEXT: lw a6, 356(sp)
; RV32ZVE32F-NEXT: lw t3, 360(sp)
; RV32ZVE32F-NEXT: lw t4, 364(sp)
-; RV32ZVE32F-NEXT: lw s4, 0(a5)
-; RV32ZVE32F-NEXT: sw s4, 116(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT: lw t5, 0(a5)
+; RV32ZVE32F-NEXT: sw t5, 116(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: lw a5, 4(a5)
; RV32ZVE32F-NEXT: sw a5, 112(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: lw a5, 0(a6)
@@ -13626,10 +13625,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
; RV32ZVE32F-NEXT: lw a6, 372(sp)
; RV32ZVE32F-NEXT: lw t3, 376(sp)
; RV32ZVE32F-NEXT: lw t4, 380(sp)
-; RV32ZVE32F-NEXT: lw s4, 0(a5)
-; RV32ZVE32F-NEXT: lw s5, 4(a5)
-; RV32ZVE32F-NEXT: lw s6, 0(a6)
-; RV32ZVE32F-NEXT: lw s7, 4(a6)
+; RV32ZVE32F-NEXT: lw t5, 0(a5)
+; RV32ZVE32F-NEXT: lw t6, 4(a5)
+; RV32ZVE32F-NEXT: lw s2, 0(a6)
+; RV32ZVE32F-NEXT: lw s3, 4(a6)
; RV32ZVE32F-NEXT: lw a5, 0(t3)
; RV32ZVE32F-NEXT: lw a6, 4(t3)
; RV32ZVE32F-NEXT: lw t3, 0(t4)
@@ -13642,10 +13641,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
; RV32ZVE32F-NEXT: sw t0, 164(a0)
; RV32ZVE32F-NEXT: sw t1, 168(a0)
; RV32ZVE32F-NEXT: sw t2, 172(a0)
-; RV32ZVE32F-NEXT: sw t5, 144(a0)
-; RV32ZVE32F-NEXT: sw t6, 148(a0)
-; RV32ZVE32F-NEXT: sw s2, 152(a0)
-; RV32ZVE32F-NEXT: sw s3, 156(a0)
+; RV32ZVE32F-NEXT: sw s4, 144(a0)
+; RV32ZVE32F-NEXT: sw s5, 148(a0)
+; RV32ZVE32F-NEXT: sw s6, 152(a0)
+; RV32ZVE32F-NEXT: sw s7, 156(a0)
; RV32ZVE32F-NEXT: sw s8, 128(a0)
; RV32ZVE32F-NEXT: sw s9, 132(a0)
; RV32ZVE32F-NEXT: sw s10, 136(a0)
@@ -13686,10 +13685,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
; RV32ZVE32F-NEXT: sw a6, 244(a0)
; RV32ZVE32F-NEXT: sw t3, 248(a0)
; RV32ZVE32F-NEXT: sw t4, 252(a0)
-; RV32ZVE32F-NEXT: sw s4, 224(a0)
-; RV32ZVE32F-NEXT: sw s5, 228(a0)
-; RV32ZVE32F-NEXT: sw s6, 232(a0)
-; RV32ZVE32F-NEXT: sw s7, 236(a0)
+; RV32ZVE32F-NEXT: sw t5, 224(a0)
+; RV32ZVE32F-NEXT: sw t6, 228(a0)
+; RV32ZVE32F-NEXT: sw s2, 232(a0)
+; RV32ZVE32F-NEXT: sw s3, 236(a0)
; RV32ZVE32F-NEXT: sw ra, 208(a0)
; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload
; RV32ZVE32F-NEXT: sw a1, 212(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
index 80110b3..683ead4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
-define void @masked_store_v1f16(<1 x half>* %val_ptr, <1 x half>* %a, <1 x half>* %m_ptr) nounwind {
+define void @masked_store_v1f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v1f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
@@ -12,15 +12,15 @@ define void @masked_store_v1f16(<1 x half>* %val_ptr, <1 x half>* %a, <1 x half>
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <1 x half>, <1 x half>* %m_ptr
+ %m = load <1 x half>, ptr %m_ptr
%mask = fcmp oeq <1 x half> %m, zeroinitializer
- %val = load <1 x half>, <1 x half>* %val_ptr
- call void @llvm.masked.store.v1f16.p0v1f16(<1 x half> %val, <1 x half>* %a, i32 8, <1 x i1> %mask)
+ %val = load <1 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v1f16.p0v1f16(<1 x half> %val, ptr %a, i32 8, <1 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v1f16.p0v1f16(<1 x half>, <1 x half>*, i32, <1 x i1>)
+declare void @llvm.masked.store.v1f16.p0v1f16(<1 x half>, ptr, i32, <1 x i1>)
-define void @masked_store_v1f32(<1 x float>* %val_ptr, <1 x float>* %a, <1 x float>* %m_ptr) nounwind {
+define void @masked_store_v1f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
@@ -30,15 +30,15 @@ define void @masked_store_v1f32(<1 x float>* %val_ptr, <1 x float>* %a, <1 x flo
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse32.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <1 x float>, <1 x float>* %m_ptr
+ %m = load <1 x float>, ptr %m_ptr
%mask = fcmp oeq <1 x float> %m, zeroinitializer
- %val = load <1 x float>, <1 x float>* %val_ptr
- call void @llvm.masked.store.v1f32.p0v1f32(<1 x float> %val, <1 x float>* %a, i32 8, <1 x i1> %mask)
+ %val = load <1 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v1f32.p0v1f32(<1 x float> %val, ptr %a, i32 8, <1 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v1f32.p0v1f32(<1 x float>, <1 x float>*, i32, <1 x i1>)
+declare void @llvm.masked.store.v1f32.p0v1f32(<1 x float>, ptr, i32, <1 x i1>)
-define void @masked_store_v1f64(<1 x double>* %val_ptr, <1 x double>* %a, <1 x double>* %m_ptr) nounwind {
+define void @masked_store_v1f64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v1f64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
@@ -58,15 +58,15 @@ define void @masked_store_v1f64(<1 x double>* %val_ptr, <1 x double>* %a, <1 x d
; RV64-NEXT: vmfeq.vf v0, v8, fa5
; RV64-NEXT: vse64.v v9, (a1), v0.t
; RV64-NEXT: ret
- %m = load <1 x double>, <1 x double>* %m_ptr
+ %m = load <1 x double>, ptr %m_ptr
%mask = fcmp oeq <1 x double> %m, zeroinitializer
- %val = load <1 x double>, <1 x double>* %val_ptr
- call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> %val, <1 x double>* %a, i32 8, <1 x i1> %mask)
+ %val = load <1 x double>, ptr %val_ptr
+ call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> %val, ptr %a, i32 8, <1 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v1f64.p0v1f64(<1 x double>, <1 x double>*, i32, <1 x i1>)
+declare void @llvm.masked.store.v1f64.p0v1f64(<1 x double>, ptr, i32, <1 x i1>)
-define void @masked_store_v2f16(<2 x half>* %val_ptr, <2 x half>* %a, <2 x half>* %m_ptr) nounwind {
+define void @masked_store_v2f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
@@ -76,15 +76,15 @@ define void @masked_store_v2f16(<2 x half>* %val_ptr, <2 x half>* %a, <2 x half>
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <2 x half>, <2 x half>* %m_ptr
+ %m = load <2 x half>, ptr %m_ptr
%mask = fcmp oeq <2 x half> %m, zeroinitializer
- %val = load <2 x half>, <2 x half>* %val_ptr
- call void @llvm.masked.store.v2f16.p0v2f16(<2 x half> %val, <2 x half>* %a, i32 8, <2 x i1> %mask)
+ %val = load <2 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v2f16.p0v2f16(<2 x half> %val, ptr %a, i32 8, <2 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v2f16.p0v2f16(<2 x half>, <2 x half>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2f16.p0v2f16(<2 x half>, ptr, i32, <2 x i1>)
-define void @masked_store_v2f32(<2 x float>* %val_ptr, <2 x float>* %a, <2 x float>* %m_ptr) nounwind {
+define void @masked_store_v2f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
@@ -94,15 +94,15 @@ define void @masked_store_v2f32(<2 x float>* %val_ptr, <2 x float>* %a, <2 x flo
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse32.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <2 x float>, <2 x float>* %m_ptr
+ %m = load <2 x float>, ptr %m_ptr
%mask = fcmp oeq <2 x float> %m, zeroinitializer
- %val = load <2 x float>, <2 x float>* %val_ptr
- call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %a, i32 8, <2 x i1> %mask)
+ %val = load <2 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, ptr %a, i32 8, <2 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, ptr, i32, <2 x i1>)
-define void @masked_store_v2f64(<2 x double>* %val_ptr, <2 x double>* %a, <2 x double>* %m_ptr) nounwind {
+define void @masked_store_v2f64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v2f64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
@@ -122,15 +122,15 @@ define void @masked_store_v2f64(<2 x double>* %val_ptr, <2 x double>* %a, <2 x d
; RV64-NEXT: vmfeq.vf v0, v8, fa5
; RV64-NEXT: vse64.v v9, (a1), v0.t
; RV64-NEXT: ret
- %m = load <2 x double>, <2 x double>* %m_ptr
+ %m = load <2 x double>, ptr %m_ptr
%mask = fcmp oeq <2 x double> %m, zeroinitializer
- %val = load <2 x double>, <2 x double>* %val_ptr
- call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %a, i32 8, <2 x i1> %mask)
+ %val = load <2 x double>, ptr %val_ptr
+ call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, ptr %a, i32 8, <2 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, ptr, i32, <2 x i1>)
-define void @masked_store_v4f16(<4 x half>* %val_ptr, <4 x half>* %a, <4 x half>* %m_ptr) nounwind {
+define void @masked_store_v4f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
@@ -140,15 +140,15 @@ define void @masked_store_v4f16(<4 x half>* %val_ptr, <4 x half>* %a, <4 x half>
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <4 x half>, <4 x half>* %m_ptr
+ %m = load <4 x half>, ptr %m_ptr
%mask = fcmp oeq <4 x half> %m, zeroinitializer
- %val = load <4 x half>, <4 x half>* %val_ptr
- call void @llvm.masked.store.v4f16.p0v4f16(<4 x half> %val, <4 x half>* %a, i32 8, <4 x i1> %mask)
+ %val = load <4 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v4f16.p0v4f16(<4 x half> %val, ptr %a, i32 8, <4 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v4f16.p0v4f16(<4 x half>, <4 x half>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f16.p0v4f16(<4 x half>, ptr, i32, <4 x i1>)
-define void @masked_store_v4f32(<4 x float>* %val_ptr, <4 x float>* %a, <4 x float>* %m_ptr) nounwind {
+define void @masked_store_v4f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
@@ -158,15 +158,15 @@ define void @masked_store_v4f32(<4 x float>* %val_ptr, <4 x float>* %a, <4 x flo
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse32.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <4 x float>, <4 x float>* %m_ptr
+ %m = load <4 x float>, ptr %m_ptr
%mask = fcmp oeq <4 x float> %m, zeroinitializer
- %val = load <4 x float>, <4 x float>* %val_ptr
- call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %a, i32 8, <4 x i1> %mask)
+ %val = load <4 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, ptr %a, i32 8, <4 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, ptr, i32, <4 x i1>)
-define void @masked_store_v4f64(<4 x double>* %val_ptr, <4 x double>* %a, <4 x double>* %m_ptr) nounwind {
+define void @masked_store_v4f64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v4f64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
@@ -186,15 +186,15 @@ define void @masked_store_v4f64(<4 x double>* %val_ptr, <4 x double>* %a, <4 x d
; RV64-NEXT: vmfeq.vf v0, v8, fa5
; RV64-NEXT: vse64.v v10, (a1), v0.t
; RV64-NEXT: ret
- %m = load <4 x double>, <4 x double>* %m_ptr
+ %m = load <4 x double>, ptr %m_ptr
%mask = fcmp oeq <4 x double> %m, zeroinitializer
- %val = load <4 x double>, <4 x double>* %val_ptr
- call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %a, i32 8, <4 x i1> %mask)
+ %val = load <4 x double>, ptr %val_ptr
+ call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, ptr %a, i32 8, <4 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, ptr, i32, <4 x i1>)
-define void @masked_store_v8f16(<8 x half>* %val_ptr, <8 x half>* %a, <8 x half>* %m_ptr) nounwind {
+define void @masked_store_v8f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
@@ -204,15 +204,15 @@ define void @masked_store_v8f16(<8 x half>* %val_ptr, <8 x half>* %a, <8 x half>
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v9, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <8 x half>, <8 x half>* %m_ptr
+ %m = load <8 x half>, ptr %m_ptr
%mask = fcmp oeq <8 x half> %m, zeroinitializer
- %val = load <8 x half>, <8 x half>* %val_ptr
- call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %a, i32 8, <8 x i1> %mask)
+ %val = load <8 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, ptr %a, i32 8, <8 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, ptr, i32, <8 x i1>)
-define void @masked_store_v8f32(<8 x float>* %val_ptr, <8 x float>* %a, <8 x float>* %m_ptr) nounwind {
+define void @masked_store_v8f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v8f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
@@ -222,15 +222,15 @@ define void @masked_store_v8f32(<8 x float>* %val_ptr, <8 x float>* %a, <8 x flo
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse32.v v10, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <8 x float>, <8 x float>* %m_ptr
+ %m = load <8 x float>, ptr %m_ptr
%mask = fcmp oeq <8 x float> %m, zeroinitializer
- %val = load <8 x float>, <8 x float>* %val_ptr
- call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %val, <8 x float>* %a, i32 8, <8 x i1> %mask)
+ %val = load <8 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %val, ptr %a, i32 8, <8 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, ptr, i32, <8 x i1>)
-define void @masked_store_v8f64(<8 x double>* %val_ptr, <8 x double>* %a, <8 x double>* %m_ptr) nounwind {
+define void @masked_store_v8f64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v8f64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
@@ -250,15 +250,15 @@ define void @masked_store_v8f64(<8 x double>* %val_ptr, <8 x double>* %a, <8 x d
; RV64-NEXT: vmfeq.vf v0, v8, fa5
; RV64-NEXT: vse64.v v12, (a1), v0.t
; RV64-NEXT: ret
- %m = load <8 x double>, <8 x double>* %m_ptr
+ %m = load <8 x double>, ptr %m_ptr
%mask = fcmp oeq <8 x double> %m, zeroinitializer
- %val = load <8 x double>, <8 x double>* %val_ptr
- call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %a, i32 8, <8 x i1> %mask)
+ %val = load <8 x double>, ptr %val_ptr
+ call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, ptr %a, i32 8, <8 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, ptr, i32, <8 x i1>)
-define void @masked_store_v16f16(<16 x half>* %val_ptr, <16 x half>* %a, <16 x half>* %m_ptr) nounwind {
+define void @masked_store_v16f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v16f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
@@ -268,15 +268,15 @@ define void @masked_store_v16f16(<16 x half>* %val_ptr, <16 x half>* %a, <16 x h
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v10, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <16 x half>, <16 x half>* %m_ptr
+ %m = load <16 x half>, ptr %m_ptr
%mask = fcmp oeq <16 x half> %m, zeroinitializer
- %val = load <16 x half>, <16 x half>* %val_ptr
- call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %a, i32 8, <16 x i1> %mask)
+ %val = load <16 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, ptr %a, i32 8, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, ptr, i32, <16 x i1>)
-define void @masked_store_v16f32(<16 x float>* %val_ptr, <16 x float>* %a, <16 x float>* %m_ptr) nounwind {
+define void @masked_store_v16f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v16f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
@@ -286,15 +286,15 @@ define void @masked_store_v16f32(<16 x float>* %val_ptr, <16 x float>* %a, <16 x
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse32.v v12, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <16 x float>, <16 x float>* %m_ptr
+ %m = load <16 x float>, ptr %m_ptr
%mask = fcmp oeq <16 x float> %m, zeroinitializer
- %val = load <16 x float>, <16 x float>* %val_ptr
- call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %val, <16 x float>* %a, i32 8, <16 x i1> %mask)
+ %val = load <16 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %val, ptr %a, i32 8, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, ptr, i32, <16 x i1>)
-define void @masked_store_v16f64(<16 x double>* %val_ptr, <16 x double>* %a, <16 x double>* %m_ptr) nounwind {
+define void @masked_store_v16f64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v16f64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
@@ -314,15 +314,15 @@ define void @masked_store_v16f64(<16 x double>* %val_ptr, <16 x double>* %a, <16
; RV64-NEXT: vmfeq.vf v0, v8, fa5
; RV64-NEXT: vse64.v v16, (a1), v0.t
; RV64-NEXT: ret
- %m = load <16 x double>, <16 x double>* %m_ptr
+ %m = load <16 x double>, ptr %m_ptr
%mask = fcmp oeq <16 x double> %m, zeroinitializer
- %val = load <16 x double>, <16 x double>* %val_ptr
- call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %val, <16 x double>* %a, i32 8, <16 x i1> %mask)
+ %val = load <16 x double>, ptr %val_ptr
+ call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %val, ptr %a, i32 8, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double>, <16 x double>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double>, ptr, i32, <16 x i1>)
-define void @masked_store_v32f16(<32 x half>* %val_ptr, <32 x half>* %a, <32 x half>* %m_ptr) nounwind {
+define void @masked_store_v32f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v32f16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a3, 32
@@ -333,15 +333,15 @@ define void @masked_store_v32f16(<32 x half>* %val_ptr, <32 x half>* %a, <32 x h
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v12, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <32 x half>, <32 x half>* %m_ptr
+ %m = load <32 x half>, ptr %m_ptr
%mask = fcmp oeq <32 x half> %m, zeroinitializer
- %val = load <32 x half>, <32 x half>* %val_ptr
- call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %a, i32 8, <32 x i1> %mask)
+ %val = load <32 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, ptr %a, i32 8, <32 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>)
+declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, ptr, i32, <32 x i1>)
-define void @masked_store_v32f32(<32 x float>* %val_ptr, <32 x float>* %a, <32 x float>* %m_ptr) nounwind {
+define void @masked_store_v32f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v32f32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a3, 32
@@ -352,15 +352,15 @@ define void @masked_store_v32f32(<32 x float>* %val_ptr, <32 x float>* %a, <32 x
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse32.v v16, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <32 x float>, <32 x float>* %m_ptr
+ %m = load <32 x float>, ptr %m_ptr
%mask = fcmp oeq <32 x float> %m, zeroinitializer
- %val = load <32 x float>, <32 x float>* %val_ptr
- call void @llvm.masked.store.v32f32.p0v32f32(<32 x float> %val, <32 x float>* %a, i32 8, <32 x i1> %mask)
+ %val = load <32 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v32f32.p0v32f32(<32 x float> %val, ptr %a, i32 8, <32 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v32f32.p0v32f32(<32 x float>, <32 x float>*, i32, <32 x i1>)
+declare void @llvm.masked.store.v32f32.p0v32f32(<32 x float>, ptr, i32, <32 x i1>)
-define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 x double>* %m_ptr) nounwind {
+define void @masked_store_v32f64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v32f64:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
@@ -440,15 +440,15 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
- %m = load <32 x double>, <32 x double>* %m_ptr
+ %m = load <32 x double>, ptr %m_ptr
%mask = fcmp oeq <32 x double> %m, zeroinitializer
- %val = load <32 x double>, <32 x double>* %val_ptr
- call void @llvm.masked.store.v32f32.p0v32f64(<32 x double> %val, <32 x double>* %a, i32 8, <32 x i1> %mask)
+ %val = load <32 x double>, ptr %val_ptr
+ call void @llvm.masked.store.v32f32.p0v32f64(<32 x double> %val, ptr %a, i32 8, <32 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v32f32.p0v32f64(<32 x double>, <32 x double>*, i32, <32 x i1>)
+declare void @llvm.masked.store.v32f32.p0v32f64(<32 x double>, ptr, i32, <32 x i1>)
-define void @masked_store_v64f16(<64 x half>* %val_ptr, <64 x half>* %a, <64 x half>* %m_ptr) nounwind {
+define void @masked_store_v64f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v64f16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a3, 64
@@ -459,15 +459,15 @@ define void @masked_store_v64f16(<64 x half>* %val_ptr, <64 x half>* %a, <64 x h
; CHECK-NEXT: vmfeq.vf v0, v8, fa5
; CHECK-NEXT: vse16.v v16, (a1), v0.t
; CHECK-NEXT: ret
- %m = load <64 x half>, <64 x half>* %m_ptr
+ %m = load <64 x half>, ptr %m_ptr
%mask = fcmp oeq <64 x half> %m, zeroinitializer
- %val = load <64 x half>, <64 x half>* %val_ptr
- call void @llvm.masked.store.v64f16.p0v64f16(<64 x half> %val, <64 x half>* %a, i32 8, <64 x i1> %mask)
+ %val = load <64 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v64f16.p0v64f16(<64 x half> %val, ptr %a, i32 8, <64 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v64f16.p0v64f16(<64 x half>, <64 x half>*, i32, <64 x i1>)
+declare void @llvm.masked.store.v64f16.p0v64f16(<64 x half>, ptr, i32, <64 x i1>)
-define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x float>* %m_ptr) nounwind {
+define void @masked_store_v64f32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v64f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
@@ -508,15 +508,15 @@ define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %m = load <64 x float>, <64 x float>* %m_ptr
+ %m = load <64 x float>, ptr %m_ptr
%mask = fcmp oeq <64 x float> %m, zeroinitializer
- %val = load <64 x float>, <64 x float>* %val_ptr
- call void @llvm.masked.store.v64f16.p0v64f32(<64 x float> %val, <64 x float>* %a, i32 8, <64 x i1> %mask)
+ %val = load <64 x float>, ptr %val_ptr
+ call void @llvm.masked.store.v64f16.p0v64f32(<64 x float> %val, ptr %a, i32 8, <64 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v64f16.p0v64f32(<64 x float>, <64 x float>*, i32, <64 x i1>)
+declare void @llvm.masked.store.v64f16.p0v64f32(<64 x float>, ptr, i32, <64 x i1>)
-define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 x half>* %m_ptr) nounwind {
+define void @masked_store_v128f16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v128f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
@@ -557,10 +557,10 @@ define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %m = load <128 x half>, <128 x half>* %m_ptr
+ %m = load <128 x half>, ptr %m_ptr
%mask = fcmp oeq <128 x half> %m, zeroinitializer
- %val = load <128 x half>, <128 x half>* %val_ptr
- call void @llvm.masked.store.v128f16.p0v128f16(<128 x half> %val, <128 x half>* %a, i32 8, <128 x i1> %mask)
+ %val = load <128 x half>, ptr %val_ptr
+ call void @llvm.masked.store.v128f16.p0v128f16(<128 x half> %val, ptr %a, i32 8, <128 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v128f16.p0v128f16(<128 x half>, <128 x half>*, i32, <128 x i1>)
+declare void @llvm.masked.store.v128f16.p0v128f16(<128 x half>, ptr, i32, <128 x i1>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 8e2a225..9e76d72 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1116,8 +1116,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 768
; ZVFHMIN32-NEXT: sw ra, 764(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: sw s0, 760(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: sw s2, 756(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT: sw s3, 752(sp) # 4-byte Folded Spill
; ZVFHMIN32-NEXT: .cfi_offset ra, -4
; ZVFHMIN32-NEXT: .cfi_offset s0, -8
+; ZVFHMIN32-NEXT: .cfi_offset s2, -12
+; ZVFHMIN32-NEXT: .cfi_offset s3, -16
; ZVFHMIN32-NEXT: addi s0, sp, 768
; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0
; ZVFHMIN32-NEXT: andi sp, sp, -128
@@ -1126,526 +1130,844 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: vsetvli zero, a2, e16, m8, ta, ma
; ZVFHMIN32-NEXT: vle16.v v24, (a1)
; ZVFHMIN32-NEXT: vle16.v v0, (a0)
-; ZVFHMIN32-NEXT: addi a0, sp, 128
+; ZVFHMIN32-NEXT: addi a0, sp, 512
; ZVFHMIN32-NEXT: vse16.v v8, (a0)
-; ZVFHMIN32-NEXT: addi a0, sp, 384
-; ZVFHMIN32-NEXT: vse16.v v0, (a0)
; ZVFHMIN32-NEXT: addi a0, sp, 256
+; ZVFHMIN32-NEXT: vse16.v v0, (a0)
+; ZVFHMIN32-NEXT: addi a0, sp, 384
; ZVFHMIN32-NEXT: vse16.v v16, (a0)
-; ZVFHMIN32-NEXT: addi a0, sp, 512
+; ZVFHMIN32-NEXT: addi a0, sp, 128
; ZVFHMIN32-NEXT: vse16.v v24, (a0)
-; ZVFHMIN32-NEXT: flh fa5, 254(sp)
-; ZVFHMIN32-NEXT: flh fa4, 510(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 63(sp)
-; ZVFHMIN32-NEXT: flh fa5, 252(sp)
-; ZVFHMIN32-NEXT: flh fa4, 508(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 62(sp)
-; ZVFHMIN32-NEXT: flh fa5, 250(sp)
-; ZVFHMIN32-NEXT: flh fa4, 506(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 61(sp)
-; ZVFHMIN32-NEXT: flh fa5, 248(sp)
-; ZVFHMIN32-NEXT: flh fa4, 504(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 60(sp)
-; ZVFHMIN32-NEXT: flh fa5, 246(sp)
-; ZVFHMIN32-NEXT: flh fa4, 502(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 59(sp)
-; ZVFHMIN32-NEXT: flh fa5, 244(sp)
-; ZVFHMIN32-NEXT: flh fa4, 500(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 58(sp)
-; ZVFHMIN32-NEXT: flh fa5, 242(sp)
-; ZVFHMIN32-NEXT: flh fa4, 498(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 57(sp)
-; ZVFHMIN32-NEXT: flh fa5, 240(sp)
-; ZVFHMIN32-NEXT: flh fa4, 496(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 56(sp)
-; ZVFHMIN32-NEXT: flh fa5, 238(sp)
-; ZVFHMIN32-NEXT: flh fa4, 494(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 55(sp)
-; ZVFHMIN32-NEXT: flh fa5, 236(sp)
-; ZVFHMIN32-NEXT: flh fa4, 492(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 54(sp)
-; ZVFHMIN32-NEXT: flh fa5, 234(sp)
-; ZVFHMIN32-NEXT: flh fa4, 490(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 53(sp)
-; ZVFHMIN32-NEXT: flh fa5, 232(sp)
-; ZVFHMIN32-NEXT: flh fa4, 488(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 52(sp)
-; ZVFHMIN32-NEXT: flh fa5, 230(sp)
-; ZVFHMIN32-NEXT: flh fa4, 486(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 51(sp)
-; ZVFHMIN32-NEXT: flh fa5, 228(sp)
-; ZVFHMIN32-NEXT: flh fa4, 484(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 50(sp)
-; ZVFHMIN32-NEXT: flh fa5, 226(sp)
-; ZVFHMIN32-NEXT: flh fa4, 482(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 49(sp)
-; ZVFHMIN32-NEXT: flh fa5, 224(sp)
-; ZVFHMIN32-NEXT: flh fa4, 480(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 48(sp)
-; ZVFHMIN32-NEXT: flh fa5, 222(sp)
-; ZVFHMIN32-NEXT: flh fa4, 478(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 47(sp)
-; ZVFHMIN32-NEXT: flh fa5, 382(sp)
-; ZVFHMIN32-NEXT: flh fa4, 638(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 127(sp)
-; ZVFHMIN32-NEXT: flh fa5, 380(sp)
-; ZVFHMIN32-NEXT: flh fa4, 636(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 126(sp)
-; ZVFHMIN32-NEXT: flh fa5, 378(sp)
-; ZVFHMIN32-NEXT: flh fa4, 634(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 125(sp)
-; ZVFHMIN32-NEXT: flh fa5, 376(sp)
-; ZVFHMIN32-NEXT: flh fa4, 632(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 124(sp)
-; ZVFHMIN32-NEXT: flh fa5, 374(sp)
-; ZVFHMIN32-NEXT: flh fa4, 630(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 123(sp)
-; ZVFHMIN32-NEXT: flh fa5, 372(sp)
-; ZVFHMIN32-NEXT: flh fa4, 628(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 122(sp)
-; ZVFHMIN32-NEXT: flh fa5, 370(sp)
-; ZVFHMIN32-NEXT: flh fa4, 626(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 121(sp)
-; ZVFHMIN32-NEXT: flh fa5, 368(sp)
-; ZVFHMIN32-NEXT: flh fa4, 624(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 120(sp)
-; ZVFHMIN32-NEXT: flh fa5, 366(sp)
-; ZVFHMIN32-NEXT: flh fa4, 622(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 119(sp)
-; ZVFHMIN32-NEXT: flh fa5, 364(sp)
-; ZVFHMIN32-NEXT: flh fa4, 620(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 118(sp)
-; ZVFHMIN32-NEXT: flh fa5, 362(sp)
-; ZVFHMIN32-NEXT: flh fa4, 618(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 117(sp)
-; ZVFHMIN32-NEXT: flh fa5, 360(sp)
-; ZVFHMIN32-NEXT: flh fa4, 616(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 116(sp)
-; ZVFHMIN32-NEXT: flh fa5, 358(sp)
-; ZVFHMIN32-NEXT: flh fa4, 614(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 115(sp)
-; ZVFHMIN32-NEXT: flh fa5, 356(sp)
-; ZVFHMIN32-NEXT: flh fa4, 612(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 114(sp)
-; ZVFHMIN32-NEXT: flh fa5, 354(sp)
-; ZVFHMIN32-NEXT: flh fa4, 610(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 113(sp)
-; ZVFHMIN32-NEXT: flh fa5, 352(sp)
-; ZVFHMIN32-NEXT: flh fa4, 608(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 112(sp)
-; ZVFHMIN32-NEXT: flh fa5, 350(sp)
-; ZVFHMIN32-NEXT: flh fa4, 606(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 111(sp)
-; ZVFHMIN32-NEXT: flh fa5, 220(sp)
-; ZVFHMIN32-NEXT: flh fa4, 476(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 46(sp)
-; ZVFHMIN32-NEXT: flh fa5, 218(sp)
-; ZVFHMIN32-NEXT: flh fa4, 474(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 45(sp)
-; ZVFHMIN32-NEXT: flh fa5, 216(sp)
-; ZVFHMIN32-NEXT: flh fa4, 472(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 44(sp)
-; ZVFHMIN32-NEXT: flh fa5, 214(sp)
-; ZVFHMIN32-NEXT: flh fa4, 470(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 43(sp)
-; ZVFHMIN32-NEXT: flh fa5, 212(sp)
-; ZVFHMIN32-NEXT: flh fa4, 468(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 42(sp)
-; ZVFHMIN32-NEXT: flh fa5, 210(sp)
-; ZVFHMIN32-NEXT: flh fa4, 466(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 41(sp)
-; ZVFHMIN32-NEXT: flh fa5, 208(sp)
-; ZVFHMIN32-NEXT: flh fa4, 464(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 40(sp)
-; ZVFHMIN32-NEXT: flh fa5, 206(sp)
-; ZVFHMIN32-NEXT: flh fa4, 462(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 39(sp)
-; ZVFHMIN32-NEXT: flh fa5, 204(sp)
-; ZVFHMIN32-NEXT: flh fa4, 460(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 38(sp)
-; ZVFHMIN32-NEXT: flh fa5, 202(sp)
-; ZVFHMIN32-NEXT: flh fa4, 458(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 37(sp)
-; ZVFHMIN32-NEXT: flh fa5, 200(sp)
-; ZVFHMIN32-NEXT: flh fa4, 456(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 36(sp)
-; ZVFHMIN32-NEXT: flh fa5, 198(sp)
-; ZVFHMIN32-NEXT: flh fa4, 454(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 35(sp)
-; ZVFHMIN32-NEXT: flh fa5, 196(sp)
-; ZVFHMIN32-NEXT: flh fa4, 452(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 34(sp)
-; ZVFHMIN32-NEXT: flh fa5, 194(sp)
-; ZVFHMIN32-NEXT: flh fa4, 450(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 33(sp)
-; ZVFHMIN32-NEXT: flh fa5, 192(sp)
-; ZVFHMIN32-NEXT: flh fa4, 448(sp)
+; ZVFHMIN32-NEXT: lh a0, 576(sp)
+; ZVFHMIN32-NEXT: lh a1, 320(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 32(sp)
-; ZVFHMIN32-NEXT: flh fa5, 190(sp)
-; ZVFHMIN32-NEXT: flh fa4, 446(sp)
+; ZVFHMIN32-NEXT: lh a0, 574(sp)
+; ZVFHMIN32-NEXT: lh a1, 318(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 31(sp)
-; ZVFHMIN32-NEXT: flh fa5, 188(sp)
-; ZVFHMIN32-NEXT: flh fa4, 444(sp)
+; ZVFHMIN32-NEXT: lh a0, 572(sp)
+; ZVFHMIN32-NEXT: lh a1, 316(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 30(sp)
-; ZVFHMIN32-NEXT: flh fa5, 348(sp)
-; ZVFHMIN32-NEXT: flh fa4, 604(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 110(sp)
-; ZVFHMIN32-NEXT: flh fa5, 346(sp)
-; ZVFHMIN32-NEXT: flh fa4, 602(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 109(sp)
-; ZVFHMIN32-NEXT: flh fa5, 344(sp)
-; ZVFHMIN32-NEXT: flh fa4, 600(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 108(sp)
-; ZVFHMIN32-NEXT: flh fa5, 342(sp)
-; ZVFHMIN32-NEXT: flh fa4, 598(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 107(sp)
-; ZVFHMIN32-NEXT: flh fa5, 340(sp)
-; ZVFHMIN32-NEXT: flh fa4, 596(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 106(sp)
-; ZVFHMIN32-NEXT: flh fa5, 338(sp)
-; ZVFHMIN32-NEXT: flh fa4, 594(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 105(sp)
-; ZVFHMIN32-NEXT: flh fa5, 336(sp)
-; ZVFHMIN32-NEXT: flh fa4, 592(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 104(sp)
-; ZVFHMIN32-NEXT: flh fa5, 334(sp)
-; ZVFHMIN32-NEXT: flh fa4, 590(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 103(sp)
-; ZVFHMIN32-NEXT: flh fa5, 332(sp)
-; ZVFHMIN32-NEXT: flh fa4, 588(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 102(sp)
-; ZVFHMIN32-NEXT: flh fa5, 330(sp)
-; ZVFHMIN32-NEXT: flh fa4, 586(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 101(sp)
-; ZVFHMIN32-NEXT: flh fa5, 328(sp)
-; ZVFHMIN32-NEXT: flh fa4, 584(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 100(sp)
-; ZVFHMIN32-NEXT: flh fa5, 326(sp)
-; ZVFHMIN32-NEXT: flh fa4, 582(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 99(sp)
-; ZVFHMIN32-NEXT: flh fa5, 324(sp)
-; ZVFHMIN32-NEXT: flh fa4, 580(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 98(sp)
-; ZVFHMIN32-NEXT: flh fa5, 322(sp)
-; ZVFHMIN32-NEXT: flh fa4, 578(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 97(sp)
-; ZVFHMIN32-NEXT: flh fa5, 320(sp)
-; ZVFHMIN32-NEXT: flh fa4, 576(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 96(sp)
-; ZVFHMIN32-NEXT: flh fa5, 318(sp)
-; ZVFHMIN32-NEXT: flh fa4, 574(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 95(sp)
-; ZVFHMIN32-NEXT: flh fa5, 316(sp)
-; ZVFHMIN32-NEXT: flh fa4, 572(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 94(sp)
-; ZVFHMIN32-NEXT: flh fa5, 186(sp)
-; ZVFHMIN32-NEXT: flh fa4, 442(sp)
+; ZVFHMIN32-NEXT: lh a0, 570(sp)
+; ZVFHMIN32-NEXT: lh a1, 314(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 29(sp)
-; ZVFHMIN32-NEXT: flh fa5, 184(sp)
-; ZVFHMIN32-NEXT: flh fa4, 440(sp)
+; ZVFHMIN32-NEXT: lh a0, 568(sp)
+; ZVFHMIN32-NEXT: lh a1, 312(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 28(sp)
-; ZVFHMIN32-NEXT: flh fa5, 182(sp)
-; ZVFHMIN32-NEXT: flh fa4, 438(sp)
+; ZVFHMIN32-NEXT: lh a0, 566(sp)
+; ZVFHMIN32-NEXT: lh a1, 310(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 27(sp)
-; ZVFHMIN32-NEXT: flh fa5, 180(sp)
-; ZVFHMIN32-NEXT: flh fa4, 436(sp)
+; ZVFHMIN32-NEXT: lh a0, 564(sp)
+; ZVFHMIN32-NEXT: lh a1, 308(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 26(sp)
-; ZVFHMIN32-NEXT: flh fa5, 178(sp)
-; ZVFHMIN32-NEXT: flh fa4, 434(sp)
+; ZVFHMIN32-NEXT: lh a0, 562(sp)
+; ZVFHMIN32-NEXT: lh a1, 306(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 25(sp)
-; ZVFHMIN32-NEXT: flh fa5, 176(sp)
-; ZVFHMIN32-NEXT: flh fa4, 432(sp)
+; ZVFHMIN32-NEXT: lh a0, 560(sp)
+; ZVFHMIN32-NEXT: lh a1, 304(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 24(sp)
-; ZVFHMIN32-NEXT: flh fa5, 174(sp)
-; ZVFHMIN32-NEXT: flh fa4, 430(sp)
+; ZVFHMIN32-NEXT: lh a0, 558(sp)
+; ZVFHMIN32-NEXT: lh a1, 302(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 23(sp)
-; ZVFHMIN32-NEXT: flh fa5, 172(sp)
-; ZVFHMIN32-NEXT: flh fa4, 428(sp)
+; ZVFHMIN32-NEXT: lh a0, 556(sp)
+; ZVFHMIN32-NEXT: lh a1, 300(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 22(sp)
-; ZVFHMIN32-NEXT: flh fa5, 170(sp)
-; ZVFHMIN32-NEXT: flh fa4, 426(sp)
+; ZVFHMIN32-NEXT: lh a0, 554(sp)
+; ZVFHMIN32-NEXT: lh a1, 298(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 21(sp)
-; ZVFHMIN32-NEXT: flh fa5, 168(sp)
-; ZVFHMIN32-NEXT: flh fa4, 424(sp)
+; ZVFHMIN32-NEXT: lh a0, 552(sp)
+; ZVFHMIN32-NEXT: lh a1, 296(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 20(sp)
-; ZVFHMIN32-NEXT: flh fa5, 166(sp)
-; ZVFHMIN32-NEXT: flh fa4, 422(sp)
+; ZVFHMIN32-NEXT: lh a0, 550(sp)
+; ZVFHMIN32-NEXT: lh a1, 294(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 19(sp)
-; ZVFHMIN32-NEXT: flh fa5, 164(sp)
-; ZVFHMIN32-NEXT: flh fa4, 420(sp)
+; ZVFHMIN32-NEXT: lh a0, 548(sp)
+; ZVFHMIN32-NEXT: lh a1, 292(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 18(sp)
-; ZVFHMIN32-NEXT: flh fa5, 162(sp)
-; ZVFHMIN32-NEXT: flh fa4, 418(sp)
+; ZVFHMIN32-NEXT: lh a0, 546(sp)
+; ZVFHMIN32-NEXT: lh a1, 290(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 17(sp)
-; ZVFHMIN32-NEXT: flh fa5, 160(sp)
-; ZVFHMIN32-NEXT: flh fa4, 416(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: lh a0, 544(sp)
+; ZVFHMIN32-NEXT: lh a1, 288(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s a1, v0
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: vmv.x.s a1, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: sb a1, 0(sp)
; ZVFHMIN32-NEXT: sb a0, 16(sp)
-; ZVFHMIN32-NEXT: flh fa5, 158(sp)
-; ZVFHMIN32-NEXT: flh fa4, 414(sp)
+; ZVFHMIN32-NEXT: lh a0, 448(sp)
+; ZVFHMIN32-NEXT: lh a1, 192(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 15(sp)
-; ZVFHMIN32-NEXT: flh fa5, 156(sp)
-; ZVFHMIN32-NEXT: flh fa4, 412(sp)
+; ZVFHMIN32-NEXT: sb a0, 96(sp)
+; ZVFHMIN32-NEXT: lh a0, 446(sp)
+; ZVFHMIN32-NEXT: lh a1, 190(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 14(sp)
-; ZVFHMIN32-NEXT: flh fa5, 154(sp)
-; ZVFHMIN32-NEXT: flh fa4, 410(sp)
+; ZVFHMIN32-NEXT: sb a0, 95(sp)
+; ZVFHMIN32-NEXT: lh a0, 444(sp)
+; ZVFHMIN32-NEXT: lh a1, 188(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 13(sp)
-; ZVFHMIN32-NEXT: flh fa5, 314(sp)
-; ZVFHMIN32-NEXT: flh fa4, 570(sp)
+; ZVFHMIN32-NEXT: sb a0, 94(sp)
+; ZVFHMIN32-NEXT: lh a0, 442(sp)
+; ZVFHMIN32-NEXT: lh a1, 186(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 93(sp)
-; ZVFHMIN32-NEXT: flh fa5, 312(sp)
-; ZVFHMIN32-NEXT: flh fa4, 568(sp)
+; ZVFHMIN32-NEXT: lh a0, 440(sp)
+; ZVFHMIN32-NEXT: lh a1, 184(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 92(sp)
-; ZVFHMIN32-NEXT: flh fa5, 310(sp)
-; ZVFHMIN32-NEXT: flh fa4, 566(sp)
+; ZVFHMIN32-NEXT: lh a0, 438(sp)
+; ZVFHMIN32-NEXT: lh a1, 182(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 91(sp)
-; ZVFHMIN32-NEXT: flh fa5, 308(sp)
-; ZVFHMIN32-NEXT: flh fa4, 564(sp)
+; ZVFHMIN32-NEXT: lh a0, 436(sp)
+; ZVFHMIN32-NEXT: lh a1, 180(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 90(sp)
-; ZVFHMIN32-NEXT: flh fa5, 306(sp)
-; ZVFHMIN32-NEXT: flh fa4, 562(sp)
+; ZVFHMIN32-NEXT: lh a0, 434(sp)
+; ZVFHMIN32-NEXT: lh a1, 178(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 89(sp)
-; ZVFHMIN32-NEXT: flh fa5, 304(sp)
-; ZVFHMIN32-NEXT: flh fa4, 560(sp)
+; ZVFHMIN32-NEXT: lh a0, 432(sp)
+; ZVFHMIN32-NEXT: lh a1, 176(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 88(sp)
-; ZVFHMIN32-NEXT: flh fa5, 302(sp)
-; ZVFHMIN32-NEXT: flh fa4, 558(sp)
+; ZVFHMIN32-NEXT: lh a0, 430(sp)
+; ZVFHMIN32-NEXT: lh a1, 174(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 87(sp)
-; ZVFHMIN32-NEXT: flh fa5, 300(sp)
-; ZVFHMIN32-NEXT: flh fa4, 556(sp)
+; ZVFHMIN32-NEXT: lh a0, 428(sp)
+; ZVFHMIN32-NEXT: lh a1, 172(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 86(sp)
-; ZVFHMIN32-NEXT: flh fa5, 298(sp)
-; ZVFHMIN32-NEXT: flh fa4, 554(sp)
+; ZVFHMIN32-NEXT: lh a0, 426(sp)
+; ZVFHMIN32-NEXT: lh a1, 170(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 85(sp)
-; ZVFHMIN32-NEXT: flh fa5, 296(sp)
-; ZVFHMIN32-NEXT: flh fa4, 552(sp)
+; ZVFHMIN32-NEXT: lh a0, 424(sp)
+; ZVFHMIN32-NEXT: lh a1, 168(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 84(sp)
-; ZVFHMIN32-NEXT: flh fa5, 294(sp)
-; ZVFHMIN32-NEXT: flh fa4, 550(sp)
+; ZVFHMIN32-NEXT: lh a0, 422(sp)
+; ZVFHMIN32-NEXT: lh a1, 166(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 83(sp)
-; ZVFHMIN32-NEXT: flh fa5, 292(sp)
-; ZVFHMIN32-NEXT: flh fa4, 548(sp)
+; ZVFHMIN32-NEXT: lh a0, 420(sp)
+; ZVFHMIN32-NEXT: lh a1, 164(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 82(sp)
-; ZVFHMIN32-NEXT: flh fa5, 290(sp)
-; ZVFHMIN32-NEXT: flh fa4, 546(sp)
+; ZVFHMIN32-NEXT: lh a0, 418(sp)
+; ZVFHMIN32-NEXT: lh a1, 162(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN32-NEXT: sb a0, 81(sp)
-; ZVFHMIN32-NEXT: flh fa5, 288(sp)
-; ZVFHMIN32-NEXT: flh fa4, 544(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: lh a0, 416(sp)
+; ZVFHMIN32-NEXT: lh a1, 160(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT: vmv.x.s a1, v24
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: vmv.x.s a1, v16
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: sb a1, 64(sp)
; ZVFHMIN32-NEXT: sb a0, 80(sp)
-; ZVFHMIN32-NEXT: flh fa5, 286(sp)
-; ZVFHMIN32-NEXT: flh fa4, 542(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 79(sp)
-; ZVFHMIN32-NEXT: flh fa5, 284(sp)
-; ZVFHMIN32-NEXT: flh fa4, 540(sp)
+; ZVFHMIN32-NEXT: lh a0, 610(sp)
+; ZVFHMIN32-NEXT: lh a1, 354(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 78(sp)
-; ZVFHMIN32-NEXT: flh fa5, 282(sp)
-; ZVFHMIN32-NEXT: flh fa4, 538(sp)
-; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 77(sp)
-; ZVFHMIN32-NEXT: flh fa5, 152(sp)
-; ZVFHMIN32-NEXT: flh fa4, 408(sp)
+; ZVFHMIN32-NEXT: sb a0, 49(sp)
+; ZVFHMIN32-NEXT: lh a0, 608(sp)
+; ZVFHMIN32-NEXT: lh a1, 352(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 12(sp)
-; ZVFHMIN32-NEXT: flh fa5, 150(sp)
-; ZVFHMIN32-NEXT: flh fa4, 406(sp)
+; ZVFHMIN32-NEXT: sb a0, 48(sp)
+; ZVFHMIN32-NEXT: lh a0, 606(sp)
+; ZVFHMIN32-NEXT: lh a1, 350(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 11(sp)
-; ZVFHMIN32-NEXT: flh fa5, 148(sp)
-; ZVFHMIN32-NEXT: flh fa4, 404(sp)
+; ZVFHMIN32-NEXT: sb a0, 47(sp)
+; ZVFHMIN32-NEXT: lh a1, 604(sp)
+; ZVFHMIN32-NEXT: lh a2, 348(sp)
+; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 7
+; ZVFHMIN32-NEXT: vmv.x.s a0, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 46(sp)
+; ZVFHMIN32-NEXT: lh a2, 602(sp)
+; ZVFHMIN32-NEXT: lh a3, 346(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7
+; ZVFHMIN32-NEXT: vmv.x.s a1, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 45(sp)
+; ZVFHMIN32-NEXT: lh a3, 600(sp)
+; ZVFHMIN32-NEXT: lh a4, 344(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 6
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT: sb a3, 44(sp)
+; ZVFHMIN32-NEXT: lh a4, 598(sp)
+; ZVFHMIN32-NEXT: lh a5, 342(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 6
+; ZVFHMIN32-NEXT: vmv.x.s a3, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 43(sp)
+; ZVFHMIN32-NEXT: lh a5, 596(sp)
+; ZVFHMIN32-NEXT: lh a6, 340(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 5
+; ZVFHMIN32-NEXT: vmv.x.s a4, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a5
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a6
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: sb a5, 42(sp)
+; ZVFHMIN32-NEXT: lh a6, 594(sp)
+; ZVFHMIN32-NEXT: lh a7, 338(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 5
+; ZVFHMIN32-NEXT: vmv.x.s a5, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
+; ZVFHMIN32-NEXT: sb a6, 41(sp)
+; ZVFHMIN32-NEXT: lh a7, 592(sp)
+; ZVFHMIN32-NEXT: lh t0, 336(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 4
+; ZVFHMIN32-NEXT: vmv.x.s a6, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a7
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t0
+; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN32-NEXT: sb a7, 40(sp)
+; ZVFHMIN32-NEXT: lh t0, 590(sp)
+; ZVFHMIN32-NEXT: lh t1, 334(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 4
+; ZVFHMIN32-NEXT: vmv.x.s a7, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4
+; ZVFHMIN32-NEXT: sb t0, 39(sp)
+; ZVFHMIN32-NEXT: lh t1, 588(sp)
+; ZVFHMIN32-NEXT: lh t2, 332(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 3
+; ZVFHMIN32-NEXT: vmv.x.s t0, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4
+; ZVFHMIN32-NEXT: sb t1, 38(sp)
+; ZVFHMIN32-NEXT: lh t2, 586(sp)
+; ZVFHMIN32-NEXT: lh t3, 330(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 3
+; ZVFHMIN32-NEXT: vmv.x.s t1, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 37(sp)
+; ZVFHMIN32-NEXT: lh t2, 584(sp)
+; ZVFHMIN32-NEXT: lh t3, 328(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 2
+; ZVFHMIN32-NEXT: vmv.x.s t4, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 36(sp)
+; ZVFHMIN32-NEXT: lh t2, 582(sp)
+; ZVFHMIN32-NEXT: lh t3, 326(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 2
+; ZVFHMIN32-NEXT: vmv.x.s t5, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 35(sp)
+; ZVFHMIN32-NEXT: lh t2, 580(sp)
+; ZVFHMIN32-NEXT: lh t3, 324(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 1
+; ZVFHMIN32-NEXT: vmv.x.s t6, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 34(sp)
+; ZVFHMIN32-NEXT: lh t2, 578(sp)
+; ZVFHMIN32-NEXT: lh t3, 322(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 1
+; ZVFHMIN32-NEXT: vmv.x.s s2, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: sb a2, 5(sp)
+; ZVFHMIN32-NEXT: sb a1, 6(sp)
+; ZVFHMIN32-NEXT: sb a0, 7(sp)
+; ZVFHMIN32-NEXT: sb t2, 33(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: sb a3, 1(sp)
+; ZVFHMIN32-NEXT: sb a2, 2(sp)
+; ZVFHMIN32-NEXT: sb a1, 3(sp)
+; ZVFHMIN32-NEXT: sb a0, 4(sp)
+; ZVFHMIN32-NEXT: lh a0, 482(sp)
+; ZVFHMIN32-NEXT: lh a1, 226(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 10(sp)
-; ZVFHMIN32-NEXT: flh fa5, 146(sp)
-; ZVFHMIN32-NEXT: flh fa4, 402(sp)
+; ZVFHMIN32-NEXT: sb a0, 113(sp)
+; ZVFHMIN32-NEXT: lh a0, 480(sp)
+; ZVFHMIN32-NEXT: lh a1, 224(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 9(sp)
-; ZVFHMIN32-NEXT: flh fa5, 144(sp)
-; ZVFHMIN32-NEXT: flh fa4, 400(sp)
+; ZVFHMIN32-NEXT: sb a0, 112(sp)
+; ZVFHMIN32-NEXT: lh a0, 478(sp)
+; ZVFHMIN32-NEXT: lh a1, 222(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 8(sp)
-; ZVFHMIN32-NEXT: flh fa5, 142(sp)
-; ZVFHMIN32-NEXT: flh fa4, 398(sp)
+; ZVFHMIN32-NEXT: sb a0, 111(sp)
+; ZVFHMIN32-NEXT: lh a1, 476(sp)
+; ZVFHMIN32-NEXT: lh a2, 220(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 7
+; ZVFHMIN32-NEXT: vmv.x.s a0, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT: sb a1, 110(sp)
+; ZVFHMIN32-NEXT: lh a2, 474(sp)
+; ZVFHMIN32-NEXT: lh a3, 218(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 7
+; ZVFHMIN32-NEXT: vmv.x.s a1, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 109(sp)
+; ZVFHMIN32-NEXT: lh a3, 472(sp)
+; ZVFHMIN32-NEXT: lh a4, 216(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 6
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT: sb a3, 108(sp)
+; ZVFHMIN32-NEXT: lh a4, 470(sp)
+; ZVFHMIN32-NEXT: lh a5, 214(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 6
+; ZVFHMIN32-NEXT: vmv.x.s a3, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 107(sp)
+; ZVFHMIN32-NEXT: lh a5, 468(sp)
+; ZVFHMIN32-NEXT: lh a6, 212(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 5
+; ZVFHMIN32-NEXT: vmv.x.s a4, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a5
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a6
+; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN32-NEXT: sb a5, 106(sp)
+; ZVFHMIN32-NEXT: lh a6, 466(sp)
+; ZVFHMIN32-NEXT: lh a7, 210(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 5
+; ZVFHMIN32-NEXT: vmv.x.s a5, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4
+; ZVFHMIN32-NEXT: sb a6, 105(sp)
+; ZVFHMIN32-NEXT: lh a7, 464(sp)
+; ZVFHMIN32-NEXT: lh t0, 208(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 4
+; ZVFHMIN32-NEXT: vmv.x.s a6, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a7
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t0
+; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN32-NEXT: sb a7, 104(sp)
+; ZVFHMIN32-NEXT: lh t0, 462(sp)
+; ZVFHMIN32-NEXT: lh t1, 206(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 4
+; ZVFHMIN32-NEXT: vmv.x.s a7, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4
+; ZVFHMIN32-NEXT: sb t0, 103(sp)
+; ZVFHMIN32-NEXT: lh t1, 460(sp)
+; ZVFHMIN32-NEXT: lh t2, 204(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 3
+; ZVFHMIN32-NEXT: vmv.x.s t0, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4
+; ZVFHMIN32-NEXT: sb t1, 102(sp)
+; ZVFHMIN32-NEXT: lh t2, 458(sp)
+; ZVFHMIN32-NEXT: lh t3, 202(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 3
+; ZVFHMIN32-NEXT: vmv.x.s t1, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 101(sp)
+; ZVFHMIN32-NEXT: lh t2, 456(sp)
+; ZVFHMIN32-NEXT: lh t3, 200(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 2
+; ZVFHMIN32-NEXT: vmv.x.s t4, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 100(sp)
+; ZVFHMIN32-NEXT: lh t2, 454(sp)
+; ZVFHMIN32-NEXT: lh t3, 198(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 2
+; ZVFHMIN32-NEXT: vmv.x.s t5, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 99(sp)
+; ZVFHMIN32-NEXT: lh t2, 452(sp)
+; ZVFHMIN32-NEXT: lh t3, 196(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 1
+; ZVFHMIN32-NEXT: vmv.x.s t6, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: sb t2, 98(sp)
+; ZVFHMIN32-NEXT: lh t2, 450(sp)
+; ZVFHMIN32-NEXT: lh t3, 194(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 1
+; ZVFHMIN32-NEXT: vmv.x.s s2, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: sb a2, 69(sp)
+; ZVFHMIN32-NEXT: sb a1, 70(sp)
+; ZVFHMIN32-NEXT: sb a0, 71(sp)
+; ZVFHMIN32-NEXT: sb t2, 97(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: sb a3, 65(sp)
+; ZVFHMIN32-NEXT: sb a2, 66(sp)
+; ZVFHMIN32-NEXT: sb a1, 67(sp)
+; ZVFHMIN32-NEXT: sb a0, 68(sp)
+; ZVFHMIN32-NEXT: lh a0, 638(sp)
+; ZVFHMIN32-NEXT: lh a1, 382(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 7(sp)
-; ZVFHMIN32-NEXT: flh fa5, 140(sp)
-; ZVFHMIN32-NEXT: flh fa4, 396(sp)
+; ZVFHMIN32-NEXT: sb a0, 63(sp)
+; ZVFHMIN32-NEXT: lh a0, 636(sp)
+; ZVFHMIN32-NEXT: lh a1, 380(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 6(sp)
-; ZVFHMIN32-NEXT: flh fa5, 138(sp)
-; ZVFHMIN32-NEXT: flh fa4, 394(sp)
+; ZVFHMIN32-NEXT: sb a0, 62(sp)
+; ZVFHMIN32-NEXT: lh a0, 634(sp)
+; ZVFHMIN32-NEXT: lh a1, 378(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 5(sp)
-; ZVFHMIN32-NEXT: flh fa5, 136(sp)
-; ZVFHMIN32-NEXT: flh fa4, 392(sp)
+; ZVFHMIN32-NEXT: sb a0, 61(sp)
+; ZVFHMIN32-NEXT: lh a0, 632(sp)
+; ZVFHMIN32-NEXT: lh a1, 376(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 4(sp)
-; ZVFHMIN32-NEXT: flh fa5, 134(sp)
-; ZVFHMIN32-NEXT: flh fa4, 390(sp)
+; ZVFHMIN32-NEXT: sb a0, 60(sp)
+; ZVFHMIN32-NEXT: lh a0, 630(sp)
+; ZVFHMIN32-NEXT: lh a1, 374(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 3(sp)
-; ZVFHMIN32-NEXT: flh fa5, 132(sp)
-; ZVFHMIN32-NEXT: flh fa4, 388(sp)
+; ZVFHMIN32-NEXT: sb a0, 59(sp)
+; ZVFHMIN32-NEXT: lh a0, 628(sp)
+; ZVFHMIN32-NEXT: lh a1, 372(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 2(sp)
-; ZVFHMIN32-NEXT: flh fa5, 130(sp)
-; ZVFHMIN32-NEXT: flh fa4, 386(sp)
+; ZVFHMIN32-NEXT: sb a0, 58(sp)
+; ZVFHMIN32-NEXT: lh a0, 626(sp)
+; ZVFHMIN32-NEXT: lh a1, 370(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 1(sp)
-; ZVFHMIN32-NEXT: flh fa5, 128(sp)
-; ZVFHMIN32-NEXT: flh fa4, 384(sp)
+; ZVFHMIN32-NEXT: sb a0, 57(sp)
+; ZVFHMIN32-NEXT: lh a0, 624(sp)
+; ZVFHMIN32-NEXT: lh a1, 368(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 0(sp)
-; ZVFHMIN32-NEXT: flh fa5, 280(sp)
-; ZVFHMIN32-NEXT: flh fa4, 536(sp)
+; ZVFHMIN32-NEXT: sb a0, 56(sp)
+; ZVFHMIN32-NEXT: lh a0, 622(sp)
+; ZVFHMIN32-NEXT: lh a1, 366(sp)
+; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v0, 15
+; ZVFHMIN32-NEXT: vslidedown.vi v22, v0, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v26, v0, 13
+; ZVFHMIN32-NEXT: vslidedown.vi v28, v0, 12
+; ZVFHMIN32-NEXT: vslidedown.vi v18, v0, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v12, v0, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v14, v0, 8
+; ZVFHMIN32-NEXT: vmv.x.s a2, v20
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 76(sp)
-; ZVFHMIN32-NEXT: flh fa5, 278(sp)
-; ZVFHMIN32-NEXT: flh fa4, 534(sp)
+; ZVFHMIN32-NEXT: sb a0, 55(sp)
+; ZVFHMIN32-NEXT: lh a0, 620(sp)
+; ZVFHMIN32-NEXT: lh a1, 364(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 15
+; ZVFHMIN32-NEXT: vmv.x.s a3, v20
+; ZVFHMIN32-NEXT: vmv.x.s a4, v22
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 75(sp)
-; ZVFHMIN32-NEXT: flh fa5, 276(sp)
-; ZVFHMIN32-NEXT: flh fa4, 532(sp)
+; ZVFHMIN32-NEXT: sb a0, 54(sp)
+; ZVFHMIN32-NEXT: lh a0, 618(sp)
+; ZVFHMIN32-NEXT: lh a1, 362(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14
+; ZVFHMIN32-NEXT: vmv.x.s a5, v20
+; ZVFHMIN32-NEXT: vmv.x.s a6, v26
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 74(sp)
-; ZVFHMIN32-NEXT: flh fa5, 274(sp)
-; ZVFHMIN32-NEXT: flh fa4, 530(sp)
+; ZVFHMIN32-NEXT: sb a0, 53(sp)
+; ZVFHMIN32-NEXT: lh a0, 616(sp)
+; ZVFHMIN32-NEXT: lh a1, 360(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 13
+; ZVFHMIN32-NEXT: vmv.x.s a7, v20
+; ZVFHMIN32-NEXT: vmv.x.s t0, v28
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 73(sp)
-; ZVFHMIN32-NEXT: flh fa5, 272(sp)
-; ZVFHMIN32-NEXT: flh fa4, 528(sp)
+; ZVFHMIN32-NEXT: sb a0, 52(sp)
+; ZVFHMIN32-NEXT: lh a0, 614(sp)
+; ZVFHMIN32-NEXT: lh a1, 358(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 12
+; ZVFHMIN32-NEXT: vmv.x.s t1, v20
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 72(sp)
-; ZVFHMIN32-NEXT: flh fa5, 270(sp)
-; ZVFHMIN32-NEXT: flh fa4, 526(sp)
+; ZVFHMIN32-NEXT: sb a0, 51(sp)
+; ZVFHMIN32-NEXT: lh a0, 612(sp)
+; ZVFHMIN32-NEXT: lh a1, 356(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t2, v18
+; ZVFHMIN32-NEXT: vslidedown.vi v18, v8, 11
+; ZVFHMIN32-NEXT: vmv.x.s t3, v18
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 71(sp)
-; ZVFHMIN32-NEXT: flh fa5, 268(sp)
-; ZVFHMIN32-NEXT: flh fa4, 524(sp)
+; ZVFHMIN32-NEXT: sb a0, 50(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: sb a3, 12(sp)
+; ZVFHMIN32-NEXT: sb a2, 13(sp)
+; ZVFHMIN32-NEXT: sb a1, 14(sp)
+; ZVFHMIN32-NEXT: sb a0, 15(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: vmv.x.s a1, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 10
+; ZVFHMIN32-NEXT: vmv.x.s a1, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: vmv.x.s a2, v12
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 9
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: vmv.x.s a3, v14
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 8
+; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: sb a3, 8(sp)
+; ZVFHMIN32-NEXT: sb a2, 9(sp)
+; ZVFHMIN32-NEXT: sb a1, 10(sp)
+; ZVFHMIN32-NEXT: sb a0, 11(sp)
+; ZVFHMIN32-NEXT: lh a0, 510(sp)
+; ZVFHMIN32-NEXT: lh a1, 254(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 70(sp)
-; ZVFHMIN32-NEXT: flh fa5, 266(sp)
-; ZVFHMIN32-NEXT: flh fa4, 522(sp)
+; ZVFHMIN32-NEXT: sb a0, 127(sp)
+; ZVFHMIN32-NEXT: lh a0, 508(sp)
+; ZVFHMIN32-NEXT: lh a1, 252(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 69(sp)
-; ZVFHMIN32-NEXT: flh fa5, 264(sp)
-; ZVFHMIN32-NEXT: flh fa4, 520(sp)
+; ZVFHMIN32-NEXT: sb a0, 126(sp)
+; ZVFHMIN32-NEXT: lh a0, 506(sp)
+; ZVFHMIN32-NEXT: lh a1, 250(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 68(sp)
-; ZVFHMIN32-NEXT: flh fa5, 262(sp)
-; ZVFHMIN32-NEXT: flh fa4, 518(sp)
+; ZVFHMIN32-NEXT: sb a0, 125(sp)
+; ZVFHMIN32-NEXT: lh a0, 504(sp)
+; ZVFHMIN32-NEXT: lh a1, 248(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 67(sp)
-; ZVFHMIN32-NEXT: flh fa5, 260(sp)
-; ZVFHMIN32-NEXT: flh fa4, 516(sp)
+; ZVFHMIN32-NEXT: sb a0, 124(sp)
+; ZVFHMIN32-NEXT: lh a0, 502(sp)
+; ZVFHMIN32-NEXT: lh a1, 246(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 66(sp)
-; ZVFHMIN32-NEXT: flh fa5, 258(sp)
-; ZVFHMIN32-NEXT: flh fa4, 514(sp)
+; ZVFHMIN32-NEXT: sb a0, 123(sp)
+; ZVFHMIN32-NEXT: lh a0, 500(sp)
+; ZVFHMIN32-NEXT: lh a1, 244(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 65(sp)
-; ZVFHMIN32-NEXT: flh fa5, 256(sp)
-; ZVFHMIN32-NEXT: flh fa4, 512(sp)
+; ZVFHMIN32-NEXT: sb a0, 122(sp)
+; ZVFHMIN32-NEXT: lh a0, 498(sp)
+; ZVFHMIN32-NEXT: lh a1, 242(sp)
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15
+; ZVFHMIN32-NEXT: vslidedown.vi v10, v24, 14
+; ZVFHMIN32-NEXT: vslidedown.vi v12, v24, 13
+; ZVFHMIN32-NEXT: vslidedown.vi v14, v24, 12
+; ZVFHMIN32-NEXT: vslidedown.vi v18, v24, 11
+; ZVFHMIN32-NEXT: vslidedown.vi v20, v24, 10
+; ZVFHMIN32-NEXT: vslidedown.vi v22, v24, 9
+; ZVFHMIN32-NEXT: vslidedown.vi v24, v24, 8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT: sb a0, 64(sp)
+; ZVFHMIN32-NEXT: sb a0, 121(sp)
+; ZVFHMIN32-NEXT: lh a2, 496(sp)
+; ZVFHMIN32-NEXT: lh a3, 240(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a0, v8
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 15
+; ZVFHMIN32-NEXT: vmv.x.s a1, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT: sb a2, 120(sp)
+; ZVFHMIN32-NEXT: lh a4, 494(sp)
+; ZVFHMIN32-NEXT: lh a5, 238(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a2, v10
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 14
+; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 119(sp)
+; ZVFHMIN32-NEXT: lh a4, 492(sp)
+; ZVFHMIN32-NEXT: lh a5, 236(sp)
+; ZVFHMIN32-NEXT: vmv.x.s a6, v12
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 13
+; ZVFHMIN32-NEXT: vmv.x.s a7, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 118(sp)
+; ZVFHMIN32-NEXT: lh a4, 490(sp)
+; ZVFHMIN32-NEXT: lh a5, 234(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t0, v14
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 12
+; ZVFHMIN32-NEXT: vmv.x.s t1, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 117(sp)
+; ZVFHMIN32-NEXT: lh a4, 488(sp)
+; ZVFHMIN32-NEXT: lh a5, 232(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t2, v18
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 11
+; ZVFHMIN32-NEXT: vmv.x.s t3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 116(sp)
+; ZVFHMIN32-NEXT: lh a4, 486(sp)
+; ZVFHMIN32-NEXT: lh a5, 230(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t4, v20
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 10
+; ZVFHMIN32-NEXT: vmv.x.s t5, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 115(sp)
+; ZVFHMIN32-NEXT: lh a4, 484(sp)
+; ZVFHMIN32-NEXT: lh a5, 228(sp)
+; ZVFHMIN32-NEXT: vmv.x.s t6, v22
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 9
+; ZVFHMIN32-NEXT: vmv.x.s s2, v8
+; ZVFHMIN32-NEXT: vmv.x.s s3, v24
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT: sb a4, 114(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: sb a3, 76(sp)
+; ZVFHMIN32-NEXT: sb a2, 77(sp)
+; ZVFHMIN32-NEXT: sb a1, 78(sp)
+; ZVFHMIN32-NEXT: sb a0, 79(sp)
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN32-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN32-NEXT: fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 8
+; ZVFHMIN32-NEXT: vmv.x.s a3, v8
+; ZVFHMIN32-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN32-NEXT: sb a3, 72(sp)
+; ZVFHMIN32-NEXT: sb a2, 73(sp)
+; ZVFHMIN32-NEXT: sb a1, 74(sp)
+; ZVFHMIN32-NEXT: sb a0, 75(sp)
; ZVFHMIN32-NEXT: li a0, 128
; ZVFHMIN32-NEXT: mv a1, sp
; ZVFHMIN32-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -1655,6 +1977,8 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN32-NEXT: addi sp, s0, -768
; ZVFHMIN32-NEXT: lw ra, 764(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: lw s0, 760(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: lw s2, 756(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT: lw s3, 752(sp) # 4-byte Folded Reload
; ZVFHMIN32-NEXT: addi sp, sp, 768
; ZVFHMIN32-NEXT: ret
;
@@ -1664,8 +1988,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 768
; ZVFHMIN64-NEXT: sd ra, 760(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: sd s0, 752(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: sd s2, 744(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT: sd s3, 736(sp) # 8-byte Folded Spill
; ZVFHMIN64-NEXT: .cfi_offset ra, -8
; ZVFHMIN64-NEXT: .cfi_offset s0, -16
+; ZVFHMIN64-NEXT: .cfi_offset s2, -24
+; ZVFHMIN64-NEXT: .cfi_offset s3, -32
; ZVFHMIN64-NEXT: addi s0, sp, 768
; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0
; ZVFHMIN64-NEXT: andi sp, sp, -128
@@ -1674,526 +2002,844 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: vsetvli zero, a2, e16, m8, ta, ma
; ZVFHMIN64-NEXT: vle16.v v24, (a1)
; ZVFHMIN64-NEXT: vle16.v v0, (a0)
-; ZVFHMIN64-NEXT: addi a0, sp, 128
+; ZVFHMIN64-NEXT: addi a0, sp, 512
; ZVFHMIN64-NEXT: vse16.v v8, (a0)
-; ZVFHMIN64-NEXT: addi a0, sp, 384
-; ZVFHMIN64-NEXT: vse16.v v0, (a0)
; ZVFHMIN64-NEXT: addi a0, sp, 256
+; ZVFHMIN64-NEXT: vse16.v v0, (a0)
+; ZVFHMIN64-NEXT: addi a0, sp, 384
; ZVFHMIN64-NEXT: vse16.v v16, (a0)
-; ZVFHMIN64-NEXT: addi a0, sp, 512
+; ZVFHMIN64-NEXT: addi a0, sp, 128
; ZVFHMIN64-NEXT: vse16.v v24, (a0)
-; ZVFHMIN64-NEXT: flh fa5, 254(sp)
-; ZVFHMIN64-NEXT: flh fa4, 510(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 63(sp)
-; ZVFHMIN64-NEXT: flh fa5, 252(sp)
-; ZVFHMIN64-NEXT: flh fa4, 508(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 62(sp)
-; ZVFHMIN64-NEXT: flh fa5, 250(sp)
-; ZVFHMIN64-NEXT: flh fa4, 506(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 61(sp)
-; ZVFHMIN64-NEXT: flh fa5, 248(sp)
-; ZVFHMIN64-NEXT: flh fa4, 504(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 60(sp)
-; ZVFHMIN64-NEXT: flh fa5, 246(sp)
-; ZVFHMIN64-NEXT: flh fa4, 502(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 59(sp)
-; ZVFHMIN64-NEXT: flh fa5, 244(sp)
-; ZVFHMIN64-NEXT: flh fa4, 500(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 58(sp)
-; ZVFHMIN64-NEXT: flh fa5, 242(sp)
-; ZVFHMIN64-NEXT: flh fa4, 498(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 57(sp)
-; ZVFHMIN64-NEXT: flh fa5, 240(sp)
-; ZVFHMIN64-NEXT: flh fa4, 496(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 56(sp)
-; ZVFHMIN64-NEXT: flh fa5, 238(sp)
-; ZVFHMIN64-NEXT: flh fa4, 494(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 55(sp)
-; ZVFHMIN64-NEXT: flh fa5, 236(sp)
-; ZVFHMIN64-NEXT: flh fa4, 492(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 54(sp)
-; ZVFHMIN64-NEXT: flh fa5, 234(sp)
-; ZVFHMIN64-NEXT: flh fa4, 490(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 53(sp)
-; ZVFHMIN64-NEXT: flh fa5, 232(sp)
-; ZVFHMIN64-NEXT: flh fa4, 488(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 52(sp)
-; ZVFHMIN64-NEXT: flh fa5, 230(sp)
-; ZVFHMIN64-NEXT: flh fa4, 486(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 51(sp)
-; ZVFHMIN64-NEXT: flh fa5, 228(sp)
-; ZVFHMIN64-NEXT: flh fa4, 484(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 50(sp)
-; ZVFHMIN64-NEXT: flh fa5, 226(sp)
-; ZVFHMIN64-NEXT: flh fa4, 482(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 49(sp)
-; ZVFHMIN64-NEXT: flh fa5, 224(sp)
-; ZVFHMIN64-NEXT: flh fa4, 480(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 48(sp)
-; ZVFHMIN64-NEXT: flh fa5, 222(sp)
-; ZVFHMIN64-NEXT: flh fa4, 478(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 47(sp)
-; ZVFHMIN64-NEXT: flh fa5, 382(sp)
-; ZVFHMIN64-NEXT: flh fa4, 638(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 127(sp)
-; ZVFHMIN64-NEXT: flh fa5, 380(sp)
-; ZVFHMIN64-NEXT: flh fa4, 636(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 126(sp)
-; ZVFHMIN64-NEXT: flh fa5, 378(sp)
-; ZVFHMIN64-NEXT: flh fa4, 634(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 125(sp)
-; ZVFHMIN64-NEXT: flh fa5, 376(sp)
-; ZVFHMIN64-NEXT: flh fa4, 632(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 124(sp)
-; ZVFHMIN64-NEXT: flh fa5, 374(sp)
-; ZVFHMIN64-NEXT: flh fa4, 630(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 123(sp)
-; ZVFHMIN64-NEXT: flh fa5, 372(sp)
-; ZVFHMIN64-NEXT: flh fa4, 628(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 122(sp)
-; ZVFHMIN64-NEXT: flh fa5, 370(sp)
-; ZVFHMIN64-NEXT: flh fa4, 626(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 121(sp)
-; ZVFHMIN64-NEXT: flh fa5, 368(sp)
-; ZVFHMIN64-NEXT: flh fa4, 624(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 120(sp)
-; ZVFHMIN64-NEXT: flh fa5, 366(sp)
-; ZVFHMIN64-NEXT: flh fa4, 622(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 119(sp)
-; ZVFHMIN64-NEXT: flh fa5, 364(sp)
-; ZVFHMIN64-NEXT: flh fa4, 620(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 118(sp)
-; ZVFHMIN64-NEXT: flh fa5, 362(sp)
-; ZVFHMIN64-NEXT: flh fa4, 618(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 117(sp)
-; ZVFHMIN64-NEXT: flh fa5, 360(sp)
-; ZVFHMIN64-NEXT: flh fa4, 616(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 116(sp)
-; ZVFHMIN64-NEXT: flh fa5, 358(sp)
-; ZVFHMIN64-NEXT: flh fa4, 614(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 115(sp)
-; ZVFHMIN64-NEXT: flh fa5, 356(sp)
-; ZVFHMIN64-NEXT: flh fa4, 612(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 114(sp)
-; ZVFHMIN64-NEXT: flh fa5, 354(sp)
-; ZVFHMIN64-NEXT: flh fa4, 610(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 113(sp)
-; ZVFHMIN64-NEXT: flh fa5, 352(sp)
-; ZVFHMIN64-NEXT: flh fa4, 608(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 112(sp)
-; ZVFHMIN64-NEXT: flh fa5, 350(sp)
-; ZVFHMIN64-NEXT: flh fa4, 606(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 111(sp)
-; ZVFHMIN64-NEXT: flh fa5, 220(sp)
-; ZVFHMIN64-NEXT: flh fa4, 476(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 46(sp)
-; ZVFHMIN64-NEXT: flh fa5, 218(sp)
-; ZVFHMIN64-NEXT: flh fa4, 474(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 45(sp)
-; ZVFHMIN64-NEXT: flh fa5, 216(sp)
-; ZVFHMIN64-NEXT: flh fa4, 472(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 44(sp)
-; ZVFHMIN64-NEXT: flh fa5, 214(sp)
-; ZVFHMIN64-NEXT: flh fa4, 470(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 43(sp)
-; ZVFHMIN64-NEXT: flh fa5, 212(sp)
-; ZVFHMIN64-NEXT: flh fa4, 468(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 42(sp)
-; ZVFHMIN64-NEXT: flh fa5, 210(sp)
-; ZVFHMIN64-NEXT: flh fa4, 466(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 41(sp)
-; ZVFHMIN64-NEXT: flh fa5, 208(sp)
-; ZVFHMIN64-NEXT: flh fa4, 464(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 40(sp)
-; ZVFHMIN64-NEXT: flh fa5, 206(sp)
-; ZVFHMIN64-NEXT: flh fa4, 462(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 39(sp)
-; ZVFHMIN64-NEXT: flh fa5, 204(sp)
-; ZVFHMIN64-NEXT: flh fa4, 460(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 38(sp)
-; ZVFHMIN64-NEXT: flh fa5, 202(sp)
-; ZVFHMIN64-NEXT: flh fa4, 458(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 37(sp)
-; ZVFHMIN64-NEXT: flh fa5, 200(sp)
-; ZVFHMIN64-NEXT: flh fa4, 456(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 36(sp)
-; ZVFHMIN64-NEXT: flh fa5, 198(sp)
-; ZVFHMIN64-NEXT: flh fa4, 454(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 35(sp)
-; ZVFHMIN64-NEXT: flh fa5, 196(sp)
-; ZVFHMIN64-NEXT: flh fa4, 452(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 34(sp)
-; ZVFHMIN64-NEXT: flh fa5, 194(sp)
-; ZVFHMIN64-NEXT: flh fa4, 450(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 33(sp)
-; ZVFHMIN64-NEXT: flh fa5, 192(sp)
-; ZVFHMIN64-NEXT: flh fa4, 448(sp)
+; ZVFHMIN64-NEXT: lh a0, 576(sp)
+; ZVFHMIN64-NEXT: lh a1, 320(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 32(sp)
-; ZVFHMIN64-NEXT: flh fa5, 190(sp)
-; ZVFHMIN64-NEXT: flh fa4, 446(sp)
+; ZVFHMIN64-NEXT: lh a0, 574(sp)
+; ZVFHMIN64-NEXT: lh a1, 318(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 31(sp)
-; ZVFHMIN64-NEXT: flh fa5, 188(sp)
-; ZVFHMIN64-NEXT: flh fa4, 444(sp)
+; ZVFHMIN64-NEXT: lh a0, 572(sp)
+; ZVFHMIN64-NEXT: lh a1, 316(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 30(sp)
-; ZVFHMIN64-NEXT: flh fa5, 348(sp)
-; ZVFHMIN64-NEXT: flh fa4, 604(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 110(sp)
-; ZVFHMIN64-NEXT: flh fa5, 346(sp)
-; ZVFHMIN64-NEXT: flh fa4, 602(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 109(sp)
-; ZVFHMIN64-NEXT: flh fa5, 344(sp)
-; ZVFHMIN64-NEXT: flh fa4, 600(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 108(sp)
-; ZVFHMIN64-NEXT: flh fa5, 342(sp)
-; ZVFHMIN64-NEXT: flh fa4, 598(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 107(sp)
-; ZVFHMIN64-NEXT: flh fa5, 340(sp)
-; ZVFHMIN64-NEXT: flh fa4, 596(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 106(sp)
-; ZVFHMIN64-NEXT: flh fa5, 338(sp)
-; ZVFHMIN64-NEXT: flh fa4, 594(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 105(sp)
-; ZVFHMIN64-NEXT: flh fa5, 336(sp)
-; ZVFHMIN64-NEXT: flh fa4, 592(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 104(sp)
-; ZVFHMIN64-NEXT: flh fa5, 334(sp)
-; ZVFHMIN64-NEXT: flh fa4, 590(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 103(sp)
-; ZVFHMIN64-NEXT: flh fa5, 332(sp)
-; ZVFHMIN64-NEXT: flh fa4, 588(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 102(sp)
-; ZVFHMIN64-NEXT: flh fa5, 330(sp)
-; ZVFHMIN64-NEXT: flh fa4, 586(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 101(sp)
-; ZVFHMIN64-NEXT: flh fa5, 328(sp)
-; ZVFHMIN64-NEXT: flh fa4, 584(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 100(sp)
-; ZVFHMIN64-NEXT: flh fa5, 326(sp)
-; ZVFHMIN64-NEXT: flh fa4, 582(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 99(sp)
-; ZVFHMIN64-NEXT: flh fa5, 324(sp)
-; ZVFHMIN64-NEXT: flh fa4, 580(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 98(sp)
-; ZVFHMIN64-NEXT: flh fa5, 322(sp)
-; ZVFHMIN64-NEXT: flh fa4, 578(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 97(sp)
-; ZVFHMIN64-NEXT: flh fa5, 320(sp)
-; ZVFHMIN64-NEXT: flh fa4, 576(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 96(sp)
-; ZVFHMIN64-NEXT: flh fa5, 318(sp)
-; ZVFHMIN64-NEXT: flh fa4, 574(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 95(sp)
-; ZVFHMIN64-NEXT: flh fa5, 316(sp)
-; ZVFHMIN64-NEXT: flh fa4, 572(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 94(sp)
-; ZVFHMIN64-NEXT: flh fa5, 186(sp)
-; ZVFHMIN64-NEXT: flh fa4, 442(sp)
+; ZVFHMIN64-NEXT: lh a0, 570(sp)
+; ZVFHMIN64-NEXT: lh a1, 314(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 29(sp)
-; ZVFHMIN64-NEXT: flh fa5, 184(sp)
-; ZVFHMIN64-NEXT: flh fa4, 440(sp)
+; ZVFHMIN64-NEXT: lh a0, 568(sp)
+; ZVFHMIN64-NEXT: lh a1, 312(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 28(sp)
-; ZVFHMIN64-NEXT: flh fa5, 182(sp)
-; ZVFHMIN64-NEXT: flh fa4, 438(sp)
+; ZVFHMIN64-NEXT: lh a0, 566(sp)
+; ZVFHMIN64-NEXT: lh a1, 310(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 27(sp)
-; ZVFHMIN64-NEXT: flh fa5, 180(sp)
-; ZVFHMIN64-NEXT: flh fa4, 436(sp)
+; ZVFHMIN64-NEXT: lh a0, 564(sp)
+; ZVFHMIN64-NEXT: lh a1, 308(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 26(sp)
-; ZVFHMIN64-NEXT: flh fa5, 178(sp)
-; ZVFHMIN64-NEXT: flh fa4, 434(sp)
+; ZVFHMIN64-NEXT: lh a0, 562(sp)
+; ZVFHMIN64-NEXT: lh a1, 306(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 25(sp)
-; ZVFHMIN64-NEXT: flh fa5, 176(sp)
-; ZVFHMIN64-NEXT: flh fa4, 432(sp)
+; ZVFHMIN64-NEXT: lh a0, 560(sp)
+; ZVFHMIN64-NEXT: lh a1, 304(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 24(sp)
-; ZVFHMIN64-NEXT: flh fa5, 174(sp)
-; ZVFHMIN64-NEXT: flh fa4, 430(sp)
+; ZVFHMIN64-NEXT: lh a0, 558(sp)
+; ZVFHMIN64-NEXT: lh a1, 302(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 23(sp)
-; ZVFHMIN64-NEXT: flh fa5, 172(sp)
-; ZVFHMIN64-NEXT: flh fa4, 428(sp)
+; ZVFHMIN64-NEXT: lh a0, 556(sp)
+; ZVFHMIN64-NEXT: lh a1, 300(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 22(sp)
-; ZVFHMIN64-NEXT: flh fa5, 170(sp)
-; ZVFHMIN64-NEXT: flh fa4, 426(sp)
+; ZVFHMIN64-NEXT: lh a0, 554(sp)
+; ZVFHMIN64-NEXT: lh a1, 298(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 21(sp)
-; ZVFHMIN64-NEXT: flh fa5, 168(sp)
-; ZVFHMIN64-NEXT: flh fa4, 424(sp)
+; ZVFHMIN64-NEXT: lh a0, 552(sp)
+; ZVFHMIN64-NEXT: lh a1, 296(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 20(sp)
-; ZVFHMIN64-NEXT: flh fa5, 166(sp)
-; ZVFHMIN64-NEXT: flh fa4, 422(sp)
+; ZVFHMIN64-NEXT: lh a0, 550(sp)
+; ZVFHMIN64-NEXT: lh a1, 294(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 19(sp)
-; ZVFHMIN64-NEXT: flh fa5, 164(sp)
-; ZVFHMIN64-NEXT: flh fa4, 420(sp)
+; ZVFHMIN64-NEXT: lh a0, 548(sp)
+; ZVFHMIN64-NEXT: lh a1, 292(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 18(sp)
-; ZVFHMIN64-NEXT: flh fa5, 162(sp)
-; ZVFHMIN64-NEXT: flh fa4, 418(sp)
+; ZVFHMIN64-NEXT: lh a0, 546(sp)
+; ZVFHMIN64-NEXT: lh a1, 290(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 17(sp)
-; ZVFHMIN64-NEXT: flh fa5, 160(sp)
-; ZVFHMIN64-NEXT: flh fa4, 416(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: lh a0, 544(sp)
+; ZVFHMIN64-NEXT: lh a1, 288(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s a1, v0
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: vmv.x.s a1, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: sb a1, 0(sp)
; ZVFHMIN64-NEXT: sb a0, 16(sp)
-; ZVFHMIN64-NEXT: flh fa5, 158(sp)
-; ZVFHMIN64-NEXT: flh fa4, 414(sp)
+; ZVFHMIN64-NEXT: lh a0, 448(sp)
+; ZVFHMIN64-NEXT: lh a1, 192(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 15(sp)
-; ZVFHMIN64-NEXT: flh fa5, 156(sp)
-; ZVFHMIN64-NEXT: flh fa4, 412(sp)
+; ZVFHMIN64-NEXT: sb a0, 96(sp)
+; ZVFHMIN64-NEXT: lh a0, 446(sp)
+; ZVFHMIN64-NEXT: lh a1, 190(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 14(sp)
-; ZVFHMIN64-NEXT: flh fa5, 154(sp)
-; ZVFHMIN64-NEXT: flh fa4, 410(sp)
+; ZVFHMIN64-NEXT: sb a0, 95(sp)
+; ZVFHMIN64-NEXT: lh a0, 444(sp)
+; ZVFHMIN64-NEXT: lh a1, 188(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 13(sp)
-; ZVFHMIN64-NEXT: flh fa5, 314(sp)
-; ZVFHMIN64-NEXT: flh fa4, 570(sp)
+; ZVFHMIN64-NEXT: sb a0, 94(sp)
+; ZVFHMIN64-NEXT: lh a0, 442(sp)
+; ZVFHMIN64-NEXT: lh a1, 186(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 93(sp)
-; ZVFHMIN64-NEXT: flh fa5, 312(sp)
-; ZVFHMIN64-NEXT: flh fa4, 568(sp)
+; ZVFHMIN64-NEXT: lh a0, 440(sp)
+; ZVFHMIN64-NEXT: lh a1, 184(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 92(sp)
-; ZVFHMIN64-NEXT: flh fa5, 310(sp)
-; ZVFHMIN64-NEXT: flh fa4, 566(sp)
+; ZVFHMIN64-NEXT: lh a0, 438(sp)
+; ZVFHMIN64-NEXT: lh a1, 182(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 91(sp)
-; ZVFHMIN64-NEXT: flh fa5, 308(sp)
-; ZVFHMIN64-NEXT: flh fa4, 564(sp)
+; ZVFHMIN64-NEXT: lh a0, 436(sp)
+; ZVFHMIN64-NEXT: lh a1, 180(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 90(sp)
-; ZVFHMIN64-NEXT: flh fa5, 306(sp)
-; ZVFHMIN64-NEXT: flh fa4, 562(sp)
+; ZVFHMIN64-NEXT: lh a0, 434(sp)
+; ZVFHMIN64-NEXT: lh a1, 178(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 89(sp)
-; ZVFHMIN64-NEXT: flh fa5, 304(sp)
-; ZVFHMIN64-NEXT: flh fa4, 560(sp)
+; ZVFHMIN64-NEXT: lh a0, 432(sp)
+; ZVFHMIN64-NEXT: lh a1, 176(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 88(sp)
-; ZVFHMIN64-NEXT: flh fa5, 302(sp)
-; ZVFHMIN64-NEXT: flh fa4, 558(sp)
+; ZVFHMIN64-NEXT: lh a0, 430(sp)
+; ZVFHMIN64-NEXT: lh a1, 174(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 87(sp)
-; ZVFHMIN64-NEXT: flh fa5, 300(sp)
-; ZVFHMIN64-NEXT: flh fa4, 556(sp)
+; ZVFHMIN64-NEXT: lh a0, 428(sp)
+; ZVFHMIN64-NEXT: lh a1, 172(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 86(sp)
-; ZVFHMIN64-NEXT: flh fa5, 298(sp)
-; ZVFHMIN64-NEXT: flh fa4, 554(sp)
+; ZVFHMIN64-NEXT: lh a0, 426(sp)
+; ZVFHMIN64-NEXT: lh a1, 170(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 85(sp)
-; ZVFHMIN64-NEXT: flh fa5, 296(sp)
-; ZVFHMIN64-NEXT: flh fa4, 552(sp)
+; ZVFHMIN64-NEXT: lh a0, 424(sp)
+; ZVFHMIN64-NEXT: lh a1, 168(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 84(sp)
-; ZVFHMIN64-NEXT: flh fa5, 294(sp)
-; ZVFHMIN64-NEXT: flh fa4, 550(sp)
+; ZVFHMIN64-NEXT: lh a0, 422(sp)
+; ZVFHMIN64-NEXT: lh a1, 166(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 83(sp)
-; ZVFHMIN64-NEXT: flh fa5, 292(sp)
-; ZVFHMIN64-NEXT: flh fa4, 548(sp)
+; ZVFHMIN64-NEXT: lh a0, 420(sp)
+; ZVFHMIN64-NEXT: lh a1, 164(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 82(sp)
-; ZVFHMIN64-NEXT: flh fa5, 290(sp)
-; ZVFHMIN64-NEXT: flh fa4, 546(sp)
+; ZVFHMIN64-NEXT: lh a0, 418(sp)
+; ZVFHMIN64-NEXT: lh a1, 162(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
; ZVFHMIN64-NEXT: sb a0, 81(sp)
-; ZVFHMIN64-NEXT: flh fa5, 288(sp)
-; ZVFHMIN64-NEXT: flh fa4, 544(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: lh a0, 416(sp)
+; ZVFHMIN64-NEXT: lh a1, 160(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT: vmv.x.s a1, v24
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: vmv.x.s a1, v16
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: sb a1, 64(sp)
; ZVFHMIN64-NEXT: sb a0, 80(sp)
-; ZVFHMIN64-NEXT: flh fa5, 286(sp)
-; ZVFHMIN64-NEXT: flh fa4, 542(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 79(sp)
-; ZVFHMIN64-NEXT: flh fa5, 284(sp)
-; ZVFHMIN64-NEXT: flh fa4, 540(sp)
+; ZVFHMIN64-NEXT: lh a0, 610(sp)
+; ZVFHMIN64-NEXT: lh a1, 354(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 78(sp)
-; ZVFHMIN64-NEXT: flh fa5, 282(sp)
-; ZVFHMIN64-NEXT: flh fa4, 538(sp)
-; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 77(sp)
-; ZVFHMIN64-NEXT: flh fa5, 152(sp)
-; ZVFHMIN64-NEXT: flh fa4, 408(sp)
+; ZVFHMIN64-NEXT: sb a0, 49(sp)
+; ZVFHMIN64-NEXT: lh a0, 608(sp)
+; ZVFHMIN64-NEXT: lh a1, 352(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 12(sp)
-; ZVFHMIN64-NEXT: flh fa5, 150(sp)
-; ZVFHMIN64-NEXT: flh fa4, 406(sp)
+; ZVFHMIN64-NEXT: sb a0, 48(sp)
+; ZVFHMIN64-NEXT: lh a0, 606(sp)
+; ZVFHMIN64-NEXT: lh a1, 350(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 11(sp)
-; ZVFHMIN64-NEXT: flh fa5, 148(sp)
-; ZVFHMIN64-NEXT: flh fa4, 404(sp)
+; ZVFHMIN64-NEXT: sb a0, 47(sp)
+; ZVFHMIN64-NEXT: lh a1, 604(sp)
+; ZVFHMIN64-NEXT: lh a2, 348(sp)
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 7
+; ZVFHMIN64-NEXT: vmv.x.s a0, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 46(sp)
+; ZVFHMIN64-NEXT: lh a2, 602(sp)
+; ZVFHMIN64-NEXT: lh a3, 346(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7
+; ZVFHMIN64-NEXT: vmv.x.s a1, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 45(sp)
+; ZVFHMIN64-NEXT: lh a3, 600(sp)
+; ZVFHMIN64-NEXT: lh a4, 344(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 6
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT: sb a3, 44(sp)
+; ZVFHMIN64-NEXT: lh a4, 598(sp)
+; ZVFHMIN64-NEXT: lh a5, 342(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 6
+; ZVFHMIN64-NEXT: vmv.x.s a3, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 43(sp)
+; ZVFHMIN64-NEXT: lh a5, 596(sp)
+; ZVFHMIN64-NEXT: lh a6, 340(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 5
+; ZVFHMIN64-NEXT: vmv.x.s a4, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a5
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a6
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: sb a5, 42(sp)
+; ZVFHMIN64-NEXT: lh a6, 594(sp)
+; ZVFHMIN64-NEXT: lh a7, 338(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 5
+; ZVFHMIN64-NEXT: vmv.x.s a5, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
+; ZVFHMIN64-NEXT: sb a6, 41(sp)
+; ZVFHMIN64-NEXT: lh a7, 592(sp)
+; ZVFHMIN64-NEXT: lh t0, 336(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 4
+; ZVFHMIN64-NEXT: vmv.x.s a6, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a7
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t0
+; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN64-NEXT: sb a7, 40(sp)
+; ZVFHMIN64-NEXT: lh t0, 590(sp)
+; ZVFHMIN64-NEXT: lh t1, 334(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 4
+; ZVFHMIN64-NEXT: vmv.x.s a7, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4
+; ZVFHMIN64-NEXT: sb t0, 39(sp)
+; ZVFHMIN64-NEXT: lh t1, 588(sp)
+; ZVFHMIN64-NEXT: lh t2, 332(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 3
+; ZVFHMIN64-NEXT: vmv.x.s t0, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4
+; ZVFHMIN64-NEXT: sb t1, 38(sp)
+; ZVFHMIN64-NEXT: lh t2, 586(sp)
+; ZVFHMIN64-NEXT: lh t3, 330(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 3
+; ZVFHMIN64-NEXT: vmv.x.s t1, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 37(sp)
+; ZVFHMIN64-NEXT: lh t2, 584(sp)
+; ZVFHMIN64-NEXT: lh t3, 328(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 2
+; ZVFHMIN64-NEXT: vmv.x.s t4, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 36(sp)
+; ZVFHMIN64-NEXT: lh t2, 582(sp)
+; ZVFHMIN64-NEXT: lh t3, 326(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 2
+; ZVFHMIN64-NEXT: vmv.x.s t5, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 35(sp)
+; ZVFHMIN64-NEXT: lh t2, 580(sp)
+; ZVFHMIN64-NEXT: lh t3, 324(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 1
+; ZVFHMIN64-NEXT: vmv.x.s t6, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 34(sp)
+; ZVFHMIN64-NEXT: lh t2, 578(sp)
+; ZVFHMIN64-NEXT: lh t3, 322(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 1
+; ZVFHMIN64-NEXT: vmv.x.s s2, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: sb a2, 5(sp)
+; ZVFHMIN64-NEXT: sb a1, 6(sp)
+; ZVFHMIN64-NEXT: sb a0, 7(sp)
+; ZVFHMIN64-NEXT: sb t2, 33(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: sb a3, 1(sp)
+; ZVFHMIN64-NEXT: sb a2, 2(sp)
+; ZVFHMIN64-NEXT: sb a1, 3(sp)
+; ZVFHMIN64-NEXT: sb a0, 4(sp)
+; ZVFHMIN64-NEXT: lh a0, 482(sp)
+; ZVFHMIN64-NEXT: lh a1, 226(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 10(sp)
-; ZVFHMIN64-NEXT: flh fa5, 146(sp)
-; ZVFHMIN64-NEXT: flh fa4, 402(sp)
+; ZVFHMIN64-NEXT: sb a0, 113(sp)
+; ZVFHMIN64-NEXT: lh a0, 480(sp)
+; ZVFHMIN64-NEXT: lh a1, 224(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 9(sp)
-; ZVFHMIN64-NEXT: flh fa5, 144(sp)
-; ZVFHMIN64-NEXT: flh fa4, 400(sp)
+; ZVFHMIN64-NEXT: sb a0, 112(sp)
+; ZVFHMIN64-NEXT: lh a0, 478(sp)
+; ZVFHMIN64-NEXT: lh a1, 222(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 8(sp)
-; ZVFHMIN64-NEXT: flh fa5, 142(sp)
-; ZVFHMIN64-NEXT: flh fa4, 398(sp)
+; ZVFHMIN64-NEXT: sb a0, 111(sp)
+; ZVFHMIN64-NEXT: lh a1, 476(sp)
+; ZVFHMIN64-NEXT: lh a2, 220(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 7
+; ZVFHMIN64-NEXT: vmv.x.s a0, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT: sb a1, 110(sp)
+; ZVFHMIN64-NEXT: lh a2, 474(sp)
+; ZVFHMIN64-NEXT: lh a3, 218(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 7
+; ZVFHMIN64-NEXT: vmv.x.s a1, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 109(sp)
+; ZVFHMIN64-NEXT: lh a3, 472(sp)
+; ZVFHMIN64-NEXT: lh a4, 216(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 6
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4
+; ZVFHMIN64-NEXT: sb a3, 108(sp)
+; ZVFHMIN64-NEXT: lh a4, 470(sp)
+; ZVFHMIN64-NEXT: lh a5, 214(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 6
+; ZVFHMIN64-NEXT: vmv.x.s a3, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 107(sp)
+; ZVFHMIN64-NEXT: lh a5, 468(sp)
+; ZVFHMIN64-NEXT: lh a6, 212(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 5
+; ZVFHMIN64-NEXT: vmv.x.s a4, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a5
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a6
+; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4
+; ZVFHMIN64-NEXT: sb a5, 106(sp)
+; ZVFHMIN64-NEXT: lh a6, 466(sp)
+; ZVFHMIN64-NEXT: lh a7, 210(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 5
+; ZVFHMIN64-NEXT: vmv.x.s a5, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4
+; ZVFHMIN64-NEXT: sb a6, 105(sp)
+; ZVFHMIN64-NEXT: lh a7, 464(sp)
+; ZVFHMIN64-NEXT: lh t0, 208(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 4
+; ZVFHMIN64-NEXT: vmv.x.s a6, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a7
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t0
+; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4
+; ZVFHMIN64-NEXT: sb a7, 104(sp)
+; ZVFHMIN64-NEXT: lh t0, 462(sp)
+; ZVFHMIN64-NEXT: lh t1, 206(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 4
+; ZVFHMIN64-NEXT: vmv.x.s a7, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4
+; ZVFHMIN64-NEXT: sb t0, 103(sp)
+; ZVFHMIN64-NEXT: lh t1, 460(sp)
+; ZVFHMIN64-NEXT: lh t2, 204(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 3
+; ZVFHMIN64-NEXT: vmv.x.s t0, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t1
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4
+; ZVFHMIN64-NEXT: sb t1, 102(sp)
+; ZVFHMIN64-NEXT: lh t2, 458(sp)
+; ZVFHMIN64-NEXT: lh t3, 202(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 3
+; ZVFHMIN64-NEXT: vmv.x.s t1, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 101(sp)
+; ZVFHMIN64-NEXT: lh t2, 456(sp)
+; ZVFHMIN64-NEXT: lh t3, 200(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 2
+; ZVFHMIN64-NEXT: vmv.x.s t4, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 100(sp)
+; ZVFHMIN64-NEXT: lh t2, 454(sp)
+; ZVFHMIN64-NEXT: lh t3, 198(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 2
+; ZVFHMIN64-NEXT: vmv.x.s t5, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 99(sp)
+; ZVFHMIN64-NEXT: lh t2, 452(sp)
+; ZVFHMIN64-NEXT: lh t3, 196(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 1
+; ZVFHMIN64-NEXT: vmv.x.s t6, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: sb t2, 98(sp)
+; ZVFHMIN64-NEXT: lh t2, 450(sp)
+; ZVFHMIN64-NEXT: lh t3, 194(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 1
+; ZVFHMIN64-NEXT: vmv.x.s s2, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: sb a2, 69(sp)
+; ZVFHMIN64-NEXT: sb a1, 70(sp)
+; ZVFHMIN64-NEXT: sb a0, 71(sp)
+; ZVFHMIN64-NEXT: sb t2, 97(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: sb a3, 65(sp)
+; ZVFHMIN64-NEXT: sb a2, 66(sp)
+; ZVFHMIN64-NEXT: sb a1, 67(sp)
+; ZVFHMIN64-NEXT: sb a0, 68(sp)
+; ZVFHMIN64-NEXT: lh a0, 638(sp)
+; ZVFHMIN64-NEXT: lh a1, 382(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 7(sp)
-; ZVFHMIN64-NEXT: flh fa5, 140(sp)
-; ZVFHMIN64-NEXT: flh fa4, 396(sp)
+; ZVFHMIN64-NEXT: sb a0, 63(sp)
+; ZVFHMIN64-NEXT: lh a0, 636(sp)
+; ZVFHMIN64-NEXT: lh a1, 380(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 6(sp)
-; ZVFHMIN64-NEXT: flh fa5, 138(sp)
-; ZVFHMIN64-NEXT: flh fa4, 394(sp)
+; ZVFHMIN64-NEXT: sb a0, 62(sp)
+; ZVFHMIN64-NEXT: lh a0, 634(sp)
+; ZVFHMIN64-NEXT: lh a1, 378(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 5(sp)
-; ZVFHMIN64-NEXT: flh fa5, 136(sp)
-; ZVFHMIN64-NEXT: flh fa4, 392(sp)
+; ZVFHMIN64-NEXT: sb a0, 61(sp)
+; ZVFHMIN64-NEXT: lh a0, 632(sp)
+; ZVFHMIN64-NEXT: lh a1, 376(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 4(sp)
-; ZVFHMIN64-NEXT: flh fa5, 134(sp)
-; ZVFHMIN64-NEXT: flh fa4, 390(sp)
+; ZVFHMIN64-NEXT: sb a0, 60(sp)
+; ZVFHMIN64-NEXT: lh a0, 630(sp)
+; ZVFHMIN64-NEXT: lh a1, 374(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 3(sp)
-; ZVFHMIN64-NEXT: flh fa5, 132(sp)
-; ZVFHMIN64-NEXT: flh fa4, 388(sp)
+; ZVFHMIN64-NEXT: sb a0, 59(sp)
+; ZVFHMIN64-NEXT: lh a0, 628(sp)
+; ZVFHMIN64-NEXT: lh a1, 372(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 2(sp)
-; ZVFHMIN64-NEXT: flh fa5, 130(sp)
-; ZVFHMIN64-NEXT: flh fa4, 386(sp)
+; ZVFHMIN64-NEXT: sb a0, 58(sp)
+; ZVFHMIN64-NEXT: lh a0, 626(sp)
+; ZVFHMIN64-NEXT: lh a1, 370(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 1(sp)
-; ZVFHMIN64-NEXT: flh fa5, 128(sp)
-; ZVFHMIN64-NEXT: flh fa4, 384(sp)
+; ZVFHMIN64-NEXT: sb a0, 57(sp)
+; ZVFHMIN64-NEXT: lh a0, 624(sp)
+; ZVFHMIN64-NEXT: lh a1, 368(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 0(sp)
-; ZVFHMIN64-NEXT: flh fa5, 280(sp)
-; ZVFHMIN64-NEXT: flh fa4, 536(sp)
+; ZVFHMIN64-NEXT: sb a0, 56(sp)
+; ZVFHMIN64-NEXT: lh a0, 622(sp)
+; ZVFHMIN64-NEXT: lh a1, 366(sp)
+; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v0, 15
+; ZVFHMIN64-NEXT: vslidedown.vi v22, v0, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v26, v0, 13
+; ZVFHMIN64-NEXT: vslidedown.vi v28, v0, 12
+; ZVFHMIN64-NEXT: vslidedown.vi v18, v0, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v12, v0, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v14, v0, 8
+; ZVFHMIN64-NEXT: vmv.x.s a2, v20
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 76(sp)
-; ZVFHMIN64-NEXT: flh fa5, 278(sp)
-; ZVFHMIN64-NEXT: flh fa4, 534(sp)
+; ZVFHMIN64-NEXT: sb a0, 55(sp)
+; ZVFHMIN64-NEXT: lh a0, 620(sp)
+; ZVFHMIN64-NEXT: lh a1, 364(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 15
+; ZVFHMIN64-NEXT: vmv.x.s a3, v20
+; ZVFHMIN64-NEXT: vmv.x.s a4, v22
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 75(sp)
-; ZVFHMIN64-NEXT: flh fa5, 276(sp)
-; ZVFHMIN64-NEXT: flh fa4, 532(sp)
+; ZVFHMIN64-NEXT: sb a0, 54(sp)
+; ZVFHMIN64-NEXT: lh a0, 618(sp)
+; ZVFHMIN64-NEXT: lh a1, 362(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14
+; ZVFHMIN64-NEXT: vmv.x.s a5, v20
+; ZVFHMIN64-NEXT: vmv.x.s a6, v26
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 74(sp)
-; ZVFHMIN64-NEXT: flh fa5, 274(sp)
-; ZVFHMIN64-NEXT: flh fa4, 530(sp)
+; ZVFHMIN64-NEXT: sb a0, 53(sp)
+; ZVFHMIN64-NEXT: lh a0, 616(sp)
+; ZVFHMIN64-NEXT: lh a1, 360(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 13
+; ZVFHMIN64-NEXT: vmv.x.s a7, v20
+; ZVFHMIN64-NEXT: vmv.x.s t0, v28
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 73(sp)
-; ZVFHMIN64-NEXT: flh fa5, 272(sp)
-; ZVFHMIN64-NEXT: flh fa4, 528(sp)
+; ZVFHMIN64-NEXT: sb a0, 52(sp)
+; ZVFHMIN64-NEXT: lh a0, 614(sp)
+; ZVFHMIN64-NEXT: lh a1, 358(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 12
+; ZVFHMIN64-NEXT: vmv.x.s t1, v20
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 72(sp)
-; ZVFHMIN64-NEXT: flh fa5, 270(sp)
-; ZVFHMIN64-NEXT: flh fa4, 526(sp)
+; ZVFHMIN64-NEXT: sb a0, 51(sp)
+; ZVFHMIN64-NEXT: lh a0, 612(sp)
+; ZVFHMIN64-NEXT: lh a1, 356(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t2, v18
+; ZVFHMIN64-NEXT: vslidedown.vi v18, v8, 11
+; ZVFHMIN64-NEXT: vmv.x.s t3, v18
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 71(sp)
-; ZVFHMIN64-NEXT: flh fa5, 268(sp)
-; ZVFHMIN64-NEXT: flh fa4, 524(sp)
+; ZVFHMIN64-NEXT: sb a0, 50(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: sb a3, 12(sp)
+; ZVFHMIN64-NEXT: sb a2, 13(sp)
+; ZVFHMIN64-NEXT: sb a1, 14(sp)
+; ZVFHMIN64-NEXT: sb a0, 15(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: vmv.x.s a1, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 10
+; ZVFHMIN64-NEXT: vmv.x.s a1, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: vmv.x.s a2, v12
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 9
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: vmv.x.s a3, v14
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a3
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 8
+; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: sb a3, 8(sp)
+; ZVFHMIN64-NEXT: sb a2, 9(sp)
+; ZVFHMIN64-NEXT: sb a1, 10(sp)
+; ZVFHMIN64-NEXT: sb a0, 11(sp)
+; ZVFHMIN64-NEXT: lh a0, 510(sp)
+; ZVFHMIN64-NEXT: lh a1, 254(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 70(sp)
-; ZVFHMIN64-NEXT: flh fa5, 266(sp)
-; ZVFHMIN64-NEXT: flh fa4, 522(sp)
+; ZVFHMIN64-NEXT: sb a0, 127(sp)
+; ZVFHMIN64-NEXT: lh a0, 508(sp)
+; ZVFHMIN64-NEXT: lh a1, 252(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 69(sp)
-; ZVFHMIN64-NEXT: flh fa5, 264(sp)
-; ZVFHMIN64-NEXT: flh fa4, 520(sp)
+; ZVFHMIN64-NEXT: sb a0, 126(sp)
+; ZVFHMIN64-NEXT: lh a0, 506(sp)
+; ZVFHMIN64-NEXT: lh a1, 250(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 68(sp)
-; ZVFHMIN64-NEXT: flh fa5, 262(sp)
-; ZVFHMIN64-NEXT: flh fa4, 518(sp)
+; ZVFHMIN64-NEXT: sb a0, 125(sp)
+; ZVFHMIN64-NEXT: lh a0, 504(sp)
+; ZVFHMIN64-NEXT: lh a1, 248(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 67(sp)
-; ZVFHMIN64-NEXT: flh fa5, 260(sp)
-; ZVFHMIN64-NEXT: flh fa4, 516(sp)
+; ZVFHMIN64-NEXT: sb a0, 124(sp)
+; ZVFHMIN64-NEXT: lh a0, 502(sp)
+; ZVFHMIN64-NEXT: lh a1, 246(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 66(sp)
-; ZVFHMIN64-NEXT: flh fa5, 258(sp)
-; ZVFHMIN64-NEXT: flh fa4, 514(sp)
+; ZVFHMIN64-NEXT: sb a0, 123(sp)
+; ZVFHMIN64-NEXT: lh a0, 500(sp)
+; ZVFHMIN64-NEXT: lh a1, 244(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 65(sp)
-; ZVFHMIN64-NEXT: flh fa5, 256(sp)
-; ZVFHMIN64-NEXT: flh fa4, 512(sp)
+; ZVFHMIN64-NEXT: sb a0, 122(sp)
+; ZVFHMIN64-NEXT: lh a0, 498(sp)
+; ZVFHMIN64-NEXT: lh a1, 242(sp)
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15
+; ZVFHMIN64-NEXT: vslidedown.vi v10, v24, 14
+; ZVFHMIN64-NEXT: vslidedown.vi v12, v24, 13
+; ZVFHMIN64-NEXT: vslidedown.vi v14, v24, 12
+; ZVFHMIN64-NEXT: vslidedown.vi v18, v24, 11
+; ZVFHMIN64-NEXT: vslidedown.vi v20, v24, 10
+; ZVFHMIN64-NEXT: vslidedown.vi v22, v24, 9
+; ZVFHMIN64-NEXT: vslidedown.vi v24, v24, 8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT: sb a0, 64(sp)
+; ZVFHMIN64-NEXT: sb a0, 121(sp)
+; ZVFHMIN64-NEXT: lh a2, 496(sp)
+; ZVFHMIN64-NEXT: lh a3, 240(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a0, v8
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 15
+; ZVFHMIN64-NEXT: vmv.x.s a1, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT: sb a2, 120(sp)
+; ZVFHMIN64-NEXT: lh a4, 494(sp)
+; ZVFHMIN64-NEXT: lh a5, 238(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a2, v10
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 14
+; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 119(sp)
+; ZVFHMIN64-NEXT: lh a4, 492(sp)
+; ZVFHMIN64-NEXT: lh a5, 236(sp)
+; ZVFHMIN64-NEXT: vmv.x.s a6, v12
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 13
+; ZVFHMIN64-NEXT: vmv.x.s a7, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 118(sp)
+; ZVFHMIN64-NEXT: lh a4, 490(sp)
+; ZVFHMIN64-NEXT: lh a5, 234(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t0, v14
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 12
+; ZVFHMIN64-NEXT: vmv.x.s t1, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 117(sp)
+; ZVFHMIN64-NEXT: lh a4, 488(sp)
+; ZVFHMIN64-NEXT: lh a5, 232(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t2, v18
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 11
+; ZVFHMIN64-NEXT: vmv.x.s t3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 116(sp)
+; ZVFHMIN64-NEXT: lh a4, 486(sp)
+; ZVFHMIN64-NEXT: lh a5, 230(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t4, v20
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 10
+; ZVFHMIN64-NEXT: vmv.x.s t5, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 115(sp)
+; ZVFHMIN64-NEXT: lh a4, 484(sp)
+; ZVFHMIN64-NEXT: lh a5, 228(sp)
+; ZVFHMIN64-NEXT: vmv.x.s t6, v22
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 9
+; ZVFHMIN64-NEXT: vmv.x.s s2, v8
+; ZVFHMIN64-NEXT: vmv.x.s s3, v24
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT: sb a4, 114(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t0
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: sb a3, 76(sp)
+; ZVFHMIN64-NEXT: sb a2, 77(sp)
+; ZVFHMIN64-NEXT: sb a1, 78(sp)
+; ZVFHMIN64-NEXT: sb a0, 79(sp)
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT: feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT: fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT: feq.h a1, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT: fmv.h.x fa4, s2
+; ZVFHMIN64-NEXT: feq.h a2, fa4, fa5
+; ZVFHMIN64-NEXT: fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 8
+; ZVFHMIN64-NEXT: vmv.x.s a3, v8
+; ZVFHMIN64-NEXT: fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT: feq.h a3, fa4, fa5
+; ZVFHMIN64-NEXT: sb a3, 72(sp)
+; ZVFHMIN64-NEXT: sb a2, 73(sp)
+; ZVFHMIN64-NEXT: sb a1, 74(sp)
+; ZVFHMIN64-NEXT: sb a0, 75(sp)
; ZVFHMIN64-NEXT: li a0, 128
; ZVFHMIN64-NEXT: mv a1, sp
; ZVFHMIN64-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -2203,6 +2849,8 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFHMIN64-NEXT: addi sp, s0, -768
; ZVFHMIN64-NEXT: ld ra, 760(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: ld s0, 752(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: ld s2, 744(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT: ld s3, 736(sp) # 8-byte Folded Reload
; ZVFHMIN64-NEXT: addi sp, sp, 768
; ZVFHMIN64-NEXT: ret
%v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
index f32795e..0bc2d34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
@@ -8,34 +8,33 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v0
-; RV32-NEXT: slli a1, a0, 19
+; RV32-NEXT: slli a1, a0, 18
; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: slli a2, a0, 26
-; RV32-NEXT: srli a2, a2, 31
+; RV32-NEXT: srli a2, a0, 31
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; RV32-NEXT: vmv.v.x v8, a2
; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: slli a1, a0, 24
+; RV32-NEXT: slli a1, a0, 27
; RV32-NEXT: srli a1, a1, 31
; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: slli a1, a0, 29
+; RV32-NEXT: slli a1, a0, 28
; RV32-NEXT: srli a1, a1, 31
; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: slli a1, a0, 18
+; RV32-NEXT: slli a1, a0, 19
; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: slli a2, a0, 26
; RV32-NEXT: srli a2, a2, 31
; RV32-NEXT: vmv.v.x v9, a2
; RV32-NEXT: vslide1down.vx v9, v9, a1
-; RV32-NEXT: slli a1, a0, 27
+; RV32-NEXT: slli a1, a0, 24
; RV32-NEXT: srli a1, a1, 31
; RV32-NEXT: vslide1down.vx v9, v9, a1
-; RV32-NEXT: slli a0, a0, 28
+; RV32-NEXT: slli a0, a0, 29
; RV32-NEXT: srli a0, a0, 31
; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV32-NEXT: vand.vi v8, v9, 1
+; RV32-NEXT: vslidedown.vi v8, v9, 4, v0.t
+; RV32-NEXT: vand.vi v8, v8, 1
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: ret
;
@@ -43,34 +42,33 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV64-NEXT: vmv.x.s a0, v0
-; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: slli a1, a0, 50
; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: slli a2, a0, 58
-; RV64-NEXT: srli a2, a2, 63
+; RV64-NEXT: srli a2, a0, 63
; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; RV64-NEXT: vmv.v.x v8, a2
; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: slli a1, a0, 59
; RV64-NEXT: srli a1, a1, 63
; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: slli a1, a0, 60
; RV64-NEXT: srli a1, a1, 63
; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: slli a1, a0, 51
; RV64-NEXT: srli a1, a1, 63
-; RV64-NEXT: slli a2, a0, 48
+; RV64-NEXT: slli a2, a0, 58
; RV64-NEXT: srli a2, a2, 63
; RV64-NEXT: vmv.v.x v9, a2
; RV64-NEXT: vslide1down.vx v9, v9, a1
-; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: slli a1, a0, 56
; RV64-NEXT: srli a1, a1, 63
; RV64-NEXT: vslide1down.vx v9, v9, a1
-; RV64-NEXT: slli a0, a0, 60
+; RV64-NEXT: slli a0, a0, 61
; RV64-NEXT: srli a0, a0, 63
; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vslide1down.vx v9, v9, a0
-; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV64-NEXT: vand.vi v8, v9, 1
+; RV64-NEXT: vslidedown.vi v8, v9, 4, v0.t
+; RV64-NEXT: vand.vi v8, v8, 1
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: ret
%2 = shufflevector <16 x i1> %0, <16 x i1> poison, <8 x i32> <i32 5, i32 12, i32 7, i32 2, i32 15, i32 13, i32 4, i32 3>
@@ -309,243 +307,14 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) {
ret <32 x i32> %2
}
-; TODO: This case should be a simple vnsrl, but gets scalarized instead
define <32 x i8> @vnsrl_v32i8_v64i8(<64 x i8> %in) {
-; RV32-LABEL: vnsrl_v32i8_v64i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: li a0, 64
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 3
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 5
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 7
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 9
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 11
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 13
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 15
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 17
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 19
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 21
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 23
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 25
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 27
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 29
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v8, v8, 31
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: lbu a0, 33(sp)
-; RV32-NEXT: lbu a1, 35(sp)
-; RV32-NEXT: lbu a2, 37(sp)
-; RV32-NEXT: lbu a3, 39(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: lbu a0, 41(sp)
-; RV32-NEXT: lbu a1, 43(sp)
-; RV32-NEXT: lbu a2, 45(sp)
-; RV32-NEXT: lbu a3, 47(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: lbu a0, 49(sp)
-; RV32-NEXT: lbu a1, 51(sp)
-; RV32-NEXT: lbu a2, 53(sp)
-; RV32-NEXT: lbu a3, 55(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: lbu a0, 57(sp)
-; RV32-NEXT: lbu a1, 59(sp)
-; RV32-NEXT: lbu a2, 61(sp)
-; RV32-NEXT: lbu a3, 63(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vnsrl_v32i8_v64i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: li a0, 64
-; RV64-NEXT: mv a1, sp
-; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vmv.v.x v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 3
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 5
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 7
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 9
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 11
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 13
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 15
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 17
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 19
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 21
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 23
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 25
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 27
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 29
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v8, v8, 31
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: lbu a0, 33(sp)
-; RV64-NEXT: lbu a1, 35(sp)
-; RV64-NEXT: lbu a2, 37(sp)
-; RV64-NEXT: lbu a3, 39(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: lbu a0, 41(sp)
-; RV64-NEXT: lbu a1, 43(sp)
-; RV64-NEXT: lbu a2, 45(sp)
-; RV64-NEXT: lbu a3, 47(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: lbu a0, 49(sp)
-; RV64-NEXT: lbu a1, 51(sp)
-; RV64-NEXT: lbu a2, 53(sp)
-; RV64-NEXT: lbu a3, 55(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: lbu a0, 57(sp)
-; RV64-NEXT: lbu a1, 59(sp)
-; RV64-NEXT: lbu a2, 61(sp)
-; RV64-NEXT: lbu a3, 63(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
-; RV64-NEXT: ret
+; CHECK-LABEL: vnsrl_v32i8_v64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vnsrl.wi v12, v8, 8
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
%res = shufflevector <64 x i8> %in, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
ret <32 x i8> %res
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
index f531ff3..563b90d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -131,23 +133,61 @@ define <4 x i64> @vslide1down_4xi64(<4 x i64> %v, i64 %b) {
ret <4 x i64> %v1
}
-define <2 x half> @vslide1down_2xf16(<2 x half> %v, half %b) {
-; CHECK-LABEL: vslide1down_2xf16:
+define <2 x bfloat> @vslide1down_2xbf16(<2 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1down_2xbf16:
; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %vb = insertelement <2 x bfloat> poison, bfloat %b, i64 0
+ %v1 = shufflevector <2 x bfloat> %v, <2 x bfloat> %vb, <2 x i32> <i32 1, i32 2>
+ ret <2 x bfloat> %v1
+}
+
+define <4 x bfloat> @vslide1down_4xbf16(<4 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1down_4xbf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
; CHECK-NEXT: ret
+ %vb = insertelement <4 x bfloat> poison, bfloat %b, i64 0
+ %v1 = shufflevector <4 x bfloat> %v, <4 x bfloat> %vb, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x bfloat> %v1
+}
+
+define <2 x half> @vslide1down_2xf16(<2 x half> %v, half %b) {
+; ZVFH-LABEL: vslide1down_2xf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfslide1down.vf v8, v8, fa0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vslide1down_2xf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
%vb = insertelement <2 x half> poison, half %b, i64 0
%v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> <i32 1, i32 2>
ret <2 x half> %v1
}
define <4 x half> @vslide1down_4xf16(<4 x half> %v, half %b) {
-; CHECK-LABEL: vslide1down_4xf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa0
-; CHECK-NEXT: ret
+; ZVFH-LABEL: vslide1down_4xf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vfslide1down.vf v8, v8, fa0
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vslide1down_4xf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0
+; ZVFHMIN-NEXT: ret
%vb = insertelement <4 x half> poison, half %b, i64 0
%v1 = shufflevector <4 x half> %v, <4 x half> %vb, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x half> %v1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
index b3390b6..0f6d68d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -140,25 +142,67 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) {
ret <4 x i64> %v1
}
-define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
-; CHECK-LABEL: vslide1up_2xf16:
+define <2 x bfloat> @vslide1up_2xbf16(<2 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1up_2xbf16:
; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %vb = insertelement <2 x half> poison, half %b, i64 0
- %v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> <i32 2, i32 0>
- ret <2 x half> %v1
+ %vb = insertelement <2 x bfloat> poison, bfloat %b, i64 0
+ %v1 = shufflevector <2 x bfloat> %v, <2 x bfloat> %vb, <2 x i32> <i32 2, i32 0>
+ ret <2 x bfloat> %v1
}
-define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
-; CHECK-LABEL: vslide1up_4xf16:
+define <4 x bfloat> @vslide1up_4xbf16(<4 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1up_4xbf16:
; CHECK: # %bb.0:
+; CHECK-NEXT: fmv.x.h a0, fa0
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
+ %vb = insertelement <4 x bfloat> poison, bfloat %b, i64 0
+ %v1 = shufflevector <4 x bfloat> %v, <4 x bfloat> %vb, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+ ret <4 x bfloat> %v1
+}
+
+define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
+; ZVFH-LABEL: vslide1up_2xf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfslide1up.vf v9, v8, fa0
+; ZVFH-NEXT: vmv1r.v v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vslide1up_2xf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vslide1up.vx v9, v8, a0
+; ZVFHMIN-NEXT: vmv1r.v v8, v9
+; ZVFHMIN-NEXT: ret
+ %vb = insertelement <2 x half> poison, half %b, i64 0
+ %v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> <i32 2, i32 0>
+ ret <2 x half> %v1
+}
+
+define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
+; ZVFH-LABEL: vslide1up_4xf16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vfslide1up.vf v9, v8, fa0
+; ZVFH-NEXT: vmv1r.v v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vslide1up_4xf16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: fmv.x.h a0, fa0
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslide1up.vx v9, v8, a0
+; ZVFHMIN-NEXT: vmv1r.v v8, v9
+; ZVFHMIN-NEXT: ret
%vb = insertelement <4 x half> poison, half %b, i64 0
%v1 = shufflevector <4 x half> %v, <4 x half> %vb, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x half> %v1
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 307a053..3ccad02 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -144,10 +144,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV32M-NEXT: srai a1, a1, 28
; RV32M-NEXT: slli a2, a1, 1
; RV32M-NEXT: add a1, a2, a1
-; RV32M-NEXT: srli a2, a1, 4
-; RV32M-NEXT: slli a1, a1, 24
-; RV32M-NEXT: srli a1, a1, 31
-; RV32M-NEXT: add a1, a2, a1
+; RV32M-NEXT: srli a2, a1, 31
+; RV32M-NEXT: srli a1, a1, 4
+; RV32M-NEXT: add a1, a1, a2
; RV32M-NEXT: slli a2, a1, 3
; RV32M-NEXT: slli a1, a1, 1
; RV32M-NEXT: sub a1, a1, a2
@@ -163,10 +162,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV64M-NEXT: srai a1, a1, 60
; RV64M-NEXT: slli a2, a1, 1
; RV64M-NEXT: add a1, a2, a1
-; RV64M-NEXT: srli a2, a1, 4
-; RV64M-NEXT: slli a1, a1, 56
-; RV64M-NEXT: srli a1, a1, 63
-; RV64M-NEXT: add a1, a2, a1
+; RV64M-NEXT: srli a2, a1, 63
+; RV64M-NEXT: srli a1, a1, 4
+; RV64M-NEXT: add a1, a1, a2
; RV64M-NEXT: slli a2, a1, 3
; RV64M-NEXT: slli a1, a1, 1
; RV64M-NEXT: subw a1, a1, a2
@@ -182,10 +180,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV32MV-NEXT: srai a1, a1, 28
; RV32MV-NEXT: slli a2, a1, 1
; RV32MV-NEXT: add a1, a2, a1
-; RV32MV-NEXT: srli a2, a1, 4
-; RV32MV-NEXT: slli a1, a1, 24
-; RV32MV-NEXT: srli a1, a1, 31
-; RV32MV-NEXT: add a1, a2, a1
+; RV32MV-NEXT: srli a2, a1, 31
+; RV32MV-NEXT: srli a1, a1, 4
+; RV32MV-NEXT: add a1, a1, a2
; RV32MV-NEXT: slli a2, a1, 3
; RV32MV-NEXT: slli a1, a1, 1
; RV32MV-NEXT: sub a1, a1, a2
@@ -201,10 +198,9 @@ define i1 @test_srem_even(i4 %X) nounwind {
; RV64MV-NEXT: srai a1, a1, 60
; RV64MV-NEXT: slli a2, a1, 1
; RV64MV-NEXT: add a1, a2, a1
-; RV64MV-NEXT: srli a2, a1, 4
-; RV64MV-NEXT: slli a1, a1, 56
-; RV64MV-NEXT: srli a1, a1, 63
-; RV64MV-NEXT: add a1, a2, a1
+; RV64MV-NEXT: srli a2, a1, 63
+; RV64MV-NEXT: srli a1, a1, 4
+; RV64MV-NEXT: add a1, a1, a2
; RV64MV-NEXT: slli a2, a1, 3
; RV64MV-NEXT: slli a1, a1, 1
; RV64MV-NEXT: subw a1, a1, a2
diff --git a/llvm/test/CodeGen/RISCV/varargs-with-fp-and-second-adj.ll b/llvm/test/CodeGen/RISCV/varargs-with-fp-and-second-adj.ll
new file mode 100644
index 0000000..b350cee
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/varargs-with-fp-and-second-adj.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+m,+c,+v < %s | FileCheck --check-prefix=RV64V %s
+
+declare void @llvm.va_copy.p0(ptr, ptr)
+declare void @llvm.va_end.p0(ptr)
+
+define dso_local void @_Z3fooPKcz(ptr noundef %0, ...) "frame-pointer"="all" {
+; RV64V-LABEL: _Z3fooPKcz:
+; RV64V: # %bb.0:
+; RV64V-NEXT: addi sp, sp, -496
+; RV64V-NEXT: .cfi_def_cfa_offset 496
+; RV64V-NEXT: sd ra, 424(sp) # 8-byte Folded Spill
+; RV64V-NEXT: sd s0, 416(sp) # 8-byte Folded Spill
+; RV64V-NEXT: .cfi_offset ra, -72
+; RV64V-NEXT: .cfi_offset s0, -80
+; RV64V-NEXT: addi s0, sp, 432
+; RV64V-NEXT: .cfi_def_cfa s0, 64
+; RV64V-NEXT: lui t0, 2
+; RV64V-NEXT: addiw t0, t0, -576
+; RV64V-NEXT: sub sp, sp, t0
+; RV64V-NEXT: sd a5, 40(s0)
+; RV64V-NEXT: sd a6, 48(s0)
+; RV64V-NEXT: sd a7, 56(s0)
+; RV64V-NEXT: sd a1, 8(s0)
+; RV64V-NEXT: sd a2, 16(s0)
+; RV64V-NEXT: sd a3, 24(s0)
+; RV64V-NEXT: sd a4, 32(s0)
+; RV64V-NEXT: sd a0, -32(s0)
+; RV64V-NEXT: addi a0, s0, 8
+; RV64V-NEXT: sd a0, -40(s0)
+; RV64V-NEXT: addi sp, s0, -496
+; RV64V-NEXT: ld ra, 424(sp) # 8-byte Folded Reload
+; RV64V-NEXT: ld s0, 416(sp) # 8-byte Folded Reload
+; RV64V-NEXT: addi sp, sp, 496
+; RV64V-NEXT: ret
+ %2 = alloca ptr, align 8
+ %3 = alloca ptr, align 8
+ %4 = alloca [8000 x i8], align 1
+ store ptr %0, ptr %2, align 8
+ call void @llvm.va_start.p0(ptr %3)
+ %5 = getelementptr inbounds [8000 x i8], ptr %4, i64 0, i64 0
+ %6 = load ptr, ptr %2, align 8
+ %7 = load ptr, ptr %3, align 8
+ call void @llvm.va_end.p0(ptr %3)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/no-opbitcast-between-identical-types.ll b/llvm/test/CodeGen/SPIRV/no-opbitcast-between-identical-types.ll
new file mode 100644
index 0000000..9b19a32
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/no-opbitcast-between-identical-types.ll
@@ -0,0 +1,17 @@
+; The goal of the test case is to ensure that no OpBitcast is generated for a bitcast between identical types.
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpFunction
+; CHECK-NOT: OpBitcast
+; CHECK: OpReturn
+
+define void @foo() {
+entry:
+ %r = bitcast i32 0 to i32
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll b/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll
index 1a630f7..e04678f 100644
--- a/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll
+++ b/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll
@@ -1,6 +1,8 @@
; This test aims to check ability to support "Arithmetic with Overflow" intrinsics
; in the special case when those intrinsics are being generated by the CodeGenPrepare;
-; pass during translations with optimization (note -O3 in llc arguments).
+; pass during translations with optimization (note -disable-lsr in the llc
+; arguments, used to prevent loop strength reduction from pre-empting this
+; pattern with a different, more profitable match).
; RUN: llc -O3 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
@@ -8,34 +10,67 @@
; RUN: llc -O3 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
-; CHECK-DAG: OpName %[[Val:.*]] "math"
-; CHECK-DAG: OpName %[[IsOver:.*]] "ov"
+; RUN: llc -O3 -disable-lsr -mtriple=spirv32-unknown-unknown %s -o - | FileCheck --check-prefix=NOLSR %s
+; RUN: %if spirv-tools %{ llc -O3 -disable-lsr -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O3 -disable-lsr -mtriple=spirv64-unknown-unknown %s -o - | FileCheck --check-prefix=NOLSR %s
+; RUN: %if spirv-tools %{ llc -O3 -disable-lsr -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[PhiRes:.*]] "lsr.iv"
+; CHECK-DAG: OpName %[[IsOver:.*]] "fl"
+; CHECK-DAG: OpName %[[Val:.*]] "lsr.iv.next"
; CHECK-DAG: %[[Int:.*]] = OpTypeInt 32 0
; CHECK-DAG: %[[Char:.*]] = OpTypeInt 8 0
; CHECK-DAG: %[[PtrChar:.*]] = OpTypePointer Generic %[[Char]]
; CHECK-DAG: %[[Bool:.*]] = OpTypeBool
-; CHECK-DAG: %[[Struct:.*]] = OpTypeStruct %[[Int]] %[[Int]]
; CHECK-DAG: %[[Const1:.*]] = OpConstant %[[Int]] 1
+; CHECK-DAG: %[[Zero:.*]] = OpConstant %[[Int]] 0
; CHECK-DAG: %[[Const42:.*]] = OpConstant %[[Char]] 42
-; CHECK-DAG: %[[Zero:.*]] = OpConstantNull %[[Int]]
; CHECK: OpFunction
; CHECK: %[[A:.*]] = OpFunctionParameter %[[Int]]
; CHECK: %[[Ptr:.*]] = OpFunctionParameter %[[PtrChar]]
-; CHECK: %[[#]] = OpLabel
-; CHECK: OpBranch %[[#]]
-; CHECK: %[[#]] = OpLabel
-; CHECK: %[[PhiRes:.*]] = OpPhi %[[Int]] %[[A]] %[[#]] %[[Val]] %[[#]]
-; CHECK: %[[AggRes:.*]] = OpIAddCarry %[[Struct]] %[[PhiRes]] %[[Const1]]
-; CHECK: %[[Val]] = OpCompositeExtract %[[Int]] %[[AggRes]] 0
-; CHECK: %[[Over:.*]] = OpCompositeExtract %[[Int]] %[[AggRes]] 1
-; CHECK: %[[IsOver]] = OpINotEqual %[[Bool:.*]] %[[Over]] %[[Zero]]
-; CHECK: OpBranchConditional %[[IsOver]] %[[#]] %[[#]]
-; CHECK: OpStore %[[Ptr]] %[[Const42]] Aligned 1
+; CHECK: %[[APlusOne:.*]] = OpIAdd %[[Int]] %[[A]] %[[Const1]]
+; CHECK: OpBranch %[[#]]
+; CHECK: [[#]] = OpLabel
+; CHECK: %[[PhiRes]] = OpPhi %[[Int]] %[[Val]] %[[#]] %[[APlusOne]] %[[#]]
+; CHECK: %[[IsOver]] = OpIEqual %[[Bool]] %[[#]] %[[#]]
+; CHECK: OpBranchConditional %[[IsOver]] %[[#]] %[[#]]
+; CHECK: [[#]] = OpLabel
+; CHECK: OpStore %[[Ptr]] %[[Const42]] Aligned 1
+; CHECK: [[Val]] = OpIAdd %[[Int]] %[[PhiRes]] %[[Const1]]
; CHECK: OpBranch %[[#]]
-; CHECK: %[[#]] = OpLabel
-; CHECK: OpReturnValue %[[Val]]
-; CHECK: OpFunctionEnd
+; CHECK: [[#]] = OpLabel
+; CHECK: OpReturnValue %[[PhiRes]]
+
+; NOLSR-DAG: OpName %[[Val:.*]] "math"
+; NOLSR-DAG: OpName %[[IsOver:.*]] "ov"
+; NOLSR-DAG: %[[Int:.*]] = OpTypeInt 32 0
+; NOLSR-DAG: %[[Char:.*]] = OpTypeInt 8 0
+; NOLSR-DAG: %[[PtrChar:.*]] = OpTypePointer Generic %[[Char]]
+; NOLSR-DAG: %[[Bool:.*]] = OpTypeBool
+; NOLSR-DAG: %[[Struct:.*]] = OpTypeStruct %[[Int]] %[[Int]]
+; NOLSR-DAG: %[[Const1:.*]] = OpConstant %[[Int]] 1
+; NOLSR-DAG: %[[Const42:.*]] = OpConstant %[[Char]] 42
+; NOLSR-DAG: %[[Zero:.*]] = OpConstantNull %[[Int]]
+
+; NOLSR: OpFunction
+; NOLSR: %[[A:.*]] = OpFunctionParameter %[[Int]]
+; NOLSR: %[[Ptr:.*]] = OpFunctionParameter %[[PtrChar]]
+; NOLSR: %[[#]] = OpLabel
+; NOLSR: OpBranch %[[#]]
+; NOLSR: %[[#]] = OpLabel
+; NOLSR: %[[PhiRes:.*]] = OpPhi %[[Int]] %[[A]] %[[#]] %[[Val]] %[[#]]
+; NOLSR: %[[AggRes:.*]] = OpIAddCarry %[[Struct]] %[[PhiRes]] %[[Const1]]
+; NOLSR: %[[Val]] = OpCompositeExtract %[[Int]] %[[AggRes]] 0
+; NOLSR: %[[Over:.*]] = OpCompositeExtract %[[Int]] %[[AggRes]] 1
+; NOLSR: %[[IsOver]] = OpINotEqual %[[Bool:.*]] %[[Over]] %[[Zero]]
+; NOLSR: OpBranchConditional %[[IsOver]] %[[#]] %[[#]]
+; NOLSR: OpStore %[[Ptr]] %[[Const42]] Aligned 1
+; NOLSR: OpBranch %[[#]]
+; NOLSR: %[[#]] = OpLabel
+; NOLSR: OpReturnValue %[[Val]]
+; NOLSR: OpFunctionEnd
define spir_func i32 @foo(i32 %a, ptr addrspace(4) %p) {
entry:
diff --git a/llvm/test/CodeGen/SystemZ/liverangeedit-kill-memop.mir b/llvm/test/CodeGen/SystemZ/liverangeedit-kill-memop.mir
new file mode 100644
index 0000000..8cfa22d
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/liverangeedit-kill-memop.mir
@@ -0,0 +1,29 @@
+# RUN: llc -o /dev/null %s -mtriple=s390x-linux-gnu -mcpu=z16 \
+# RUN: -verify-machineinstrs -run-pass=register-coalescer
+
+# The LOCMux below produces a dead definition and will be turned into
+# a KILL instruction (by LiveRangeEdit::eliminateDeadDef()). When this
+# happens, the memory operand must also be removed, as required by the
+# machine verifier.
+
+---
+name: fun
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d
+
+ %3:addr64bit = COPY killed $r2d
+
+ bb.1:
+ %5:grx32bit = LMux killed %3, 0, $noreg :: (load (s32))
+ CHIMux killed %5, 0, implicit-def $cc
+ %7:grx32bit = LHIMux 0
+ %1:grx32bit = COPY killed %7
+ %1:grx32bit = LOCMux %1, undef %8:addr64bit, 0, 14, 6, implicit killed $cc :: (load (s32))
+ dead %0:grx32bit = COPY killed %1
+
+ bb.2:
+ J %bb.2
+
+...
diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
index e3d65a3..304605d 100644
--- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
@@ -27,8 +27,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: sbfx r1, r0, #0, #4
; CHECK-NEXT: add.w r1, r1, r1, lsl #1
-; CHECK-NEXT: ubfx r2, r1, #7, #1
-; CHECK-NEXT: add.w r1, r2, r1, lsr #4
+; CHECK-NEXT: lsrs r2, r1, #4
+; CHECK-NEXT: add.w r1, r2, r1, lsr #31
; CHECK-NEXT: add.w r1, r1, r1, lsl #1
; CHECK-NEXT: sub.w r0, r0, r1, lsl #1
; CHECK-NEXT: and r0, r0, #15
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 2a52b9e..4efde4b 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -203,46 +203,39 @@ define i32 @PR43159(ptr %a0) {
; SSE-LABEL: PR43159:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT: psubd %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrld $1, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
+; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: psrld $7, %xmm0
-; SSE-NEXT: psrld $6, %xmm2
-; SSE-NEXT: movd %xmm2, %edi
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: psrld $6, %xmm1
+; SSE-NEXT: movd %xmm1, %edi
; SSE-NEXT: pextrd $1, %xmm0, %esi
-; SSE-NEXT: pextrd $2, %xmm2, %edx
+; SSE-NEXT: pextrd $2, %xmm1, %edx
; SSE-NEXT: pextrd $3, %xmm0, %ecx
; SSE-NEXT: jmp foo # TAILCALL
;
; AVX1-LABEL: PR43159:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $7, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %edi
; AVX1-NEXT: vpextrd $1, %xmm1, %esi
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 2b392e6..2f19d14 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2657,37 +2657,36 @@ define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32639,54613,19945,21846,2979,5243,32897,32833]
+; SSE2-NEXT: pmulhw %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm3
+; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,0,65535]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: psraw $2, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,65535]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: psrlw $15, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
@@ -2695,41 +2694,40 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
+; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [256,16384,4096,u,u,u,512,256]
+; SSE41-NEXT: pmulhw %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7]
+; SSE41-NEXT: psrlw $15, %xmm1
; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [256,16384,4096,u,u,u,512,256]
-; SSE41-NEXT: pmulhw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
-; SSE41-NEXT: psrlw $15, %xmm0
-; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [256,16384,4096,u,u,u,512,256]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7]
+; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [256,16384,4096,u,u,u,512,256]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
-; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833]
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [256,16384,4096,u,u,u,512,256]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
+; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7]
+; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [256,16384,4096,u,u,u,512,256]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
-; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
@@ -2770,33 +2768,33 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,1,1,1,0]
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32767,32767,32703,0,0,32897,32769,16385]
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $6, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psraw $12, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psraw $6, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,0]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: psraw $12, %xmm1
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psraw $1, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: psraw $1, %xmm4
+; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2809,13 +2807,13 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,256,256,u,u,512,256,8]
-; SSE41-NEXT: pmulhw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; SSE41-NEXT: psrlw $15, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
-; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $15, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,256,256,u,u,512,256,8]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
@@ -2823,12 +2821,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,256,256,u,u,512,256,8]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
-; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
@@ -2836,12 +2834,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0]
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,256,256,u,u,512,256,8]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
-; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
@@ -2928,15 +2926,14 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm2, %xmm3
-; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632]
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psraw $8, %xmm1
@@ -2956,20 +2953,21 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $6, %xmm2
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: packuswb %xmm1, %xmm2
-; SSE41-NEXT: psrlw $7, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: paddb %xmm2, %xmm0
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT: psraw $8, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $6, %xmm3
+; SSE41-NEXT: psllw $8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $7, %xmm1
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
@@ -2979,18 +2977,18 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $6, %xmm2, %xmm3
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: pr38658:
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index d5a4815..5571519 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -412,19 +412,19 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b:
@@ -683,21 +683,19 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-LABEL: pr38477:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
-; SSE2-NEXT: pmulhuw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubw %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
-; SSE2-NEXT: paddw %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [u,4957,57457,4103,16385,35545,2048,2115]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768]
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38477:
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index b124bd5..dfae853 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -10,8 +10,8 @@ define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) {
; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0]
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: addl %edi, %eax
; ALL-NEXT: retq
@@ -55,7 +55,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI-NEXT: retq
entry:
%0 = zext <4 x i8> %a to <4 x i32>
- %1 = mul nsw <4 x i32> %0, <i32 0, i32 1, i32 2, i32 127>
+ %1 = mul nsw <4 x i32> %0, <i32 16, i32 1, i32 2, i32 127>
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
%op.extra = add nsw i32 %2, %c
ret i32 %op.extra
@@ -97,7 +97,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVX512VLVNNI-NEXT: retq
entry:
%0 = zext <4 x i4> %a to <4 x i32>
- %1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 127>, %0
+ %1 = mul nsw <4 x i32> <i32 16, i32 1, i32 2, i32 127>, %0
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
%op.extra = add nsw i32 %2, %c
ret i32 %op.extra
@@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
@@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT: vmovd %xmm2, %eax
@@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
@@ -138,7 +138,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI-NEXT: retq
entry:
%0 = sext <4 x i8> %a to <4 x i32>
- %1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 255>, %0
+ %1 = mul nsw <4 x i32> <i32 16, i32 1, i32 2, i32 255>, %0
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
%op.extra = add nsw i32 %2, %c
ret i32 %op.extra
@@ -151,8 +151,8 @@ define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) {
; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0]
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: addl %edi, %eax
; ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 1b13cee..e595b3f 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -8,18 +8,17 @@ define i64 @PR62286(i32 %a) {
; SSE-LABEL: PR62286:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,1,0]
-; SSE-NEXT: paddd %xmm0, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,0]
+; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll
index 64c7f4f..9463809 100644
--- a/llvm/test/CodeGen/X86/pr67333.ll
+++ b/llvm/test/CodeGen/X86/pr67333.ll
@@ -18,42 +18,42 @@ define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3
; CHECK-NEXT: vpslld $13, %xmm2, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
-; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %ecx, %xmm3
-; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm3
+; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm4
+; CHECK-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vpsrld $17, %xmm1, %xmm0
-; CHECK-NEXT: vpslld $15, %xmm1, %xmm3
-; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vpsrld $19, %xmm1, %xmm3
-; CHECK-NEXT: vpslld $13, %xmm1, %xmm4
-; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
-; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpslld $15, %xmm1, %xmm4
+; CHECK-NEXT: vpor %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4
+; CHECK-NEXT: vpslld $13, %xmm1, %xmm5
+; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3
-; CHECK-NEXT: vpslld $15, %xmm0, %xmm4
-; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
-; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4
-; CHECK-NEXT: vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT: vpsrld $17, %xmm0, %xmm4
+; CHECK-NEXT: vpslld $15, %xmm0, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
-; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpsrld $19, %xmm0, %xmm5
+; CHECK-NEXT: vpslld $13, %xmm0, %xmm6
+; CHECK-NEXT: vpor %xmm5, %xmm6, %xmm5
+; CHECK-NEXT: vpxor %xmm5, %xmm4, %xmm4
; CHECK-NEXT: vpsrld $10, %xmm0, %xmm0
-; CHECK-NEXT: vpxor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpxor %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3
-; CHECK-NEXT: vpslld $15, %xmm0, %xmm4
-; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
-; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4
-; CHECK-NEXT: vpslld $13, %xmm0, %xmm5
+; CHECK-NEXT: vpsrld $17, %xmm0, %xmm4
+; CHECK-NEXT: vpslld $15, %xmm0, %xmm5
; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4
-; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; CHECK-NEXT: vpsrld $10, %xmm0, %xmm4
-; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT: vpsrld $19, %xmm0, %xmm5
+; CHECK-NEXT: vpslld $13, %xmm0, %xmm6
+; CHECK-NEXT: vpor %xmm5, %xmm6, %xmm5
+; CHECK-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; CHECK-NEXT: vpsrld $10, %xmm0, %xmm5
+; CHECK-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT: vpslld $15, %xmm2, %xmm4
; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 2455169..c74440d 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -986,26 +986,56 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
; SSE2-NEXT: movdqu (%rcx), %xmm2
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: incl %eax
; SSE2-NEXT: retq
;
-; AVX-LABEL: sad_unroll_nonzero_initial:
-; AVX: # %bb.0: # %bb
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
-; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu (%rdx), %xmm1
-; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-LABEL: sad_unroll_nonzero_initial:
+; AVX1: # %bb.0: # %bb
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu (%rdx), %xmm1
+; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sad_unroll_nonzero_initial:
+; AVX2: # %bb.0: # %bb
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu (%rdx), %xmm1
+; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: sad_unroll_nonzero_initial:
+; AVX512: # %bb.0: # %bb
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu (%rdx), %xmm1
+; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
bb:
%tmp = load <16 x i8>, ptr %arg, align 1
%tmp4 = load <16 x i8>, ptr %arg1, align 1
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index a2bcadd..08d9183 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2068,14 +2068,12 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrlq $32, %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrlq $32, %xmm1
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2141,14 +2139,12 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrlq $32, %xmm0
-; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrlq $32, %xmm1
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 28b3d7c..fd0525e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -2407,7 +2407,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2416,14 +2416,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2432,7 +2432,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
@@ -2441,14 +2441,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index fca72ca..193e570 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -2301,7 +2301,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
@@ -2309,14 +2309,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -2324,7 +2324,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -2332,14 +2332,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
; AVX512VLBW-NEXT: retq
;
; AVX10-LABEL: splatconstant_funnnel_v32i8:
; AVX10: # %bb.0:
; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; AVX10-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
; AVX10-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index e238553..1d807fa 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -1124,7 +1124,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -1137,35 +1137,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index 6d02801..6c3e4a9 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1859,7 +1859,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1868,14 +1868,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1884,14 +1884,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
@@ -1900,7 +1900,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 8071150..47524b2 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -447,12 +447,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
@@ -467,17 +467,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -1651,7 +1651,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
@@ -1659,14 +1659,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -1674,14 +1674,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -1689,7 +1689,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index edfa56a..436fbe3 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -132,14 +132,14 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
-; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
@@ -151,12 +151,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
@@ -174,35 +174,35 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
-; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6
+; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm8)
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm3, %ymm4
+; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm3 & ymm8)
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
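splatconstant_funnnel_v64i8 with both operands equal to %x and a splat amount of 4 is a rotate of every byte left by 4. AVX-512 has no 8-bit shifts, so the lowering checked above uses 16-bit shifts plus a ternlog bit-select to drop the bits that cross byte lanes. A rough per-byte model, assuming a 0xF0/0x0F nibble mask (sketch only, not the actual codegen):

def rotl8(x, n):
    # Reference rotate of a single byte.
    return ((x << n) | (x >> (8 - n))) & 0xFF

def lowered_rot4(x):
    # Model of the emitted sequence: word-sized shifts leak bits across
    # byte boundaries, and the ternlog select keeps only the in-lane bits
    # (high nibble from the left shift, low nibble from the right shift).
    sll = (x << 4) & 0xFF   # in-lane part of vpsllw $4
    srl = (x >> 4) & 0xFF   # in-lane part of vpsrlw $4
    return (sll & 0xF0) | (srl & 0x0F)

assert all(rotl8(x, 4) == lowered_rot4(x) for x in range(256))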
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 7903781..9ba7047 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2411,7 +2411,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2420,14 +2420,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2436,7 +2436,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
@@ -2445,14 +2445,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 1f3ffc7d..9c259ed 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1706,7 +1706,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $254, %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | ymm1 | ymm2
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
@@ -2088,7 +2088,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
@@ -2096,14 +2096,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -2111,7 +2111,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -2119,14 +2119,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
; AVX512VLBW-NEXT: retq
;
; AVX10-LABEL: splatconstant_funnnel_v32i8:
; AVX10: # %bb.0:
; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; AVX10-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
; AVX10-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 5e19b67..6652231 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -921,7 +921,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $254, %zmm3, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
@@ -963,7 +963,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogq $254, %zmm3, %zmm1, %zmm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
@@ -1166,7 +1166,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -1179,35 +1179,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index e1292ae..4e30d70 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1928,7 +1928,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1937,14 +1937,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1953,14 +1953,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
@@ -1969,7 +1969,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 504ba58..bc2ea68 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -473,17 +473,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -492,17 +492,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -1702,7 +1702,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
@@ -1710,14 +1710,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -1725,14 +1725,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -1740,7 +1740,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 4364c04..11ea650 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -132,35 +132,35 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $6, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm6
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
-; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm6
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
-; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm8, %zmm6
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm8 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
-; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm8, %zmm4
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm8 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -172,35 +172,35 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $6, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
-; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm6
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
-; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm8, %ymm6
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm8 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm8, %ymm4
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm8 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index ef6129c..83ccfd7 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -651,7 +651,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm1
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & mem)
; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = urem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
diff --git a/llvm/test/Linker/pr22807.ll b/llvm/test/Linker/pr22807.ll
index 1db1eab..a1fe384 100644
--- a/llvm/test/Linker/pr22807.ll
+++ b/llvm/test/Linker/pr22807.ll
@@ -1,9 +1,6 @@
-; RUN: llvm-link -S -o - %p/pr22807.ll %p/Inputs/pr22807-1.ll %p/Inputs/pr22807-2.ll | FileCheck %s
+; RUN: not llvm-link -S -o - %p/pr22807.ll %p/Inputs/pr22807-1.ll %p/Inputs/pr22807-2.ll 2>&1 | FileCheck %s
-; CHECK-NOT: type
-; CHECK: %struct.B = type { %struct.A }
-; CHECK-NEXT: %struct.A = type { %struct.B }
-; CHECK-NOT: type
+; CHECK: error: identified structure type 'struct.A' is recursive
%struct.B = type { %struct.A }
%struct.A = type opaque
diff --git a/llvm/test/MC/AArch64/SME/revd-diagnostics.s b/llvm/test/MC/AArch64/SME/revd-diagnostics.s
index e7242e5..6fd9af4 100644
--- a/llvm/test/MC/AArch64/SME/revd-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME/revd-diagnostics.s
@@ -27,3 +27,12 @@ revd z0.q, p0/m, z0.s
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
// CHECK-NEXT: revd z0.q, p0/m, z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Negative tests for instructions that are incompatible with movprfx
+
+movprfx z21, z25
+revd z21.q, p5/m, z10.q
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov
+// CHECK-NEXT: revd z21.q, p5/m, z10.q
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME/revd.s b/llvm/test/MC/AArch64/SME/revd.s
index dc73da1..ad8a6cf 100644
--- a/llvm/test/MC/AArch64/SME/revd.s
+++ b/llvm/test/MC/AArch64/SME/revd.s
@@ -34,19 +34,4 @@ revd z31.q, p7/m, z31.q
// CHECK-INST: revd z31.q, p7/m, z31.q
// CHECK-ENCODING: [0xff,0x9f,0x2e,0x05]
// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: 052e9fff <unknown>
-
-// --------------------------------------------------------------------------//
-// Test compatibility with MOVPRFX instruction.
-
-movprfx z21, z25
-// CHECK-INST: movprfx z21, z25
-// CHECK-ENCODING: [0x35,0xbf,0x20,0x04]
-// CHECK-ERROR: instruction requires: sve or sme
-// CHECK-UNKNOWN: 0420bf35 <unknown>
-
-revd z21.q, p5/m, z10.q
-// CHECK-INST: revd z21.q, p5/m, z10.q
-// CHECK-ENCODING: [0x55,0x95,0x2e,0x05]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: 052e9555 <unknown>
+// CHECK-UNKNOWN: 052e9fff <unknown>
\ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s
index da3fa95..da3fa95 100644
--- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.s
+++ b/llvm/test/MC/X86/amx-transpose-att.s
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s
index 3b8dfae..3b8dfae 100644
--- a/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s
+++ b/llvm/test/MC/X86/amx-transpose-intel.s
diff --git a/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
index 8db0a28..ef362f9 100644
--- a/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
@@ -4,7 +4,7 @@
; dbg.value which still used the removed argument.
; The %p argument should be removed, and the use of it in dbg.value should be
-; changed to undef.
+; changed to poison.
%fun_t = type ptr
define void @foo() {
@@ -15,7 +15,7 @@ define void @foo() {
define internal void @bar(ptr %p) {
; CHECK-LABEL: define {{.*}}void @bar()
-; CHECK-NEXT: #dbg_value(ptr undef, !3, !DIExpression(), !5
+; CHECK-NEXT: #dbg_value(ptr poison, !3, !DIExpression(), !5
call void @llvm.dbg.value(metadata ptr %p, metadata !3, metadata !DIExpression()), !dbg !5
ret void
}
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
index 2ee515f..b2da9af 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
@@ -184,7 +184,7 @@ declare void @print(i32)
; CHECK-NEXT: [[VAL_RELOAD:%.*]] = load i32, ptr [[VAL_RELOAD_ADDR]], align 4
; CHECK-NEXT: [[NEW_VAL:%.*]] = add i32 [[VAL_RELOAD]], 123
; CHECK-NEXT: tail call void @deallocate(ptr [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr, i32, ptr } { ptr null, i32 undef, ptr undef }, i32 [[NEW_VAL]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr, i32, ptr } { ptr null, i32 poison, ptr poison }, i32 [[NEW_VAL]], 1
; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { ptr, i32, ptr } [[TMP3]], ptr @deallocate, 2
; CHECK-NEXT: ret { ptr, i32, ptr } [[TMP4]]
;
@@ -202,7 +202,7 @@ declare void @print(i32)
; CHECK-NEXT: [[VAL_RELOAD:%.*]] = load i32, ptr [[VAL_RELOAD_ADDR]], align 4
; CHECK-NEXT: [[NEW_VAL:%.*]] = add i32 [[VAL_RELOAD]], 123
; CHECK-NEXT: tail call void @deallocate(ptr [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr, i32, ptr } { ptr null, i32 undef, ptr undef }, i32 [[NEW_VAL]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr, i32, ptr } { ptr null, i32 poison, ptr poison }, i32 [[NEW_VAL]], 1
; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { ptr, i32, ptr } [[TMP3]], ptr @deallocate, 2
; CHECK-NEXT: ret { ptr, i32, ptr } [[TMP4]]
;
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll b/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll
index ec6461e..e69deaa 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/urem.ll
@@ -462,4 +462,13 @@ join:
ret i8 %res
}
+define i1 @urem_i1() {
+; CHECK-LABEL: @urem_i1(
+; CHECK-NEXT: [[REM:%.*]] = urem i1 false, false
+; CHECK-NEXT: ret i1 [[REM]]
+;
+ %rem = urem i1 false, false
+ ret i1 %rem
+}
+
declare void @use(i1)
diff --git a/llvm/test/Transforms/InstCombine/trunc-shl-zext.ll b/llvm/test/Transforms/InstCombine/trunc-shl-zext.ll
index 2e7b6d8..576125b 100644
--- a/llvm/test/Transforms/InstCombine/trunc-shl-zext.ll
+++ b/llvm/test/Transforms/InstCombine/trunc-shl-zext.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+target datalayout = "n8:16:32:64"
+
define i32 @trunc_shl_zext_32(i32 %a) {
; CHECK-LABEL: define i32 @trunc_shl_zext_32
; CHECK-SAME: (i32 [[A:%.*]]) {
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[A]] to i16
-; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[TRUNC]], 4
-; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[SHL]] to i32
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[A]], 4
+; CHECK-NEXT: [[EXT:%.*]] = and i32 [[SHL]], 65520
; CHECK-NEXT: ret i32 [[EXT]]
;
%trunc = trunc i32 %a to i16
@@ -18,9 +19,8 @@ define i32 @trunc_shl_zext_32(i32 %a) {
define i64 @trunc_shl_zext_64(i64 %a) {
; CHECK-LABEL: define i64 @trunc_shl_zext_64
; CHECK-SAME: (i64 [[A:%.*]]) {
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[A]] to i8
-; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[TRUNC]], 7
-; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[SHL]] to i64
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], 7
+; CHECK-NEXT: [[EXT:%.*]] = and i64 [[SHL]], 128
; CHECK-NEXT: ret i64 [[EXT]]
;
%trunc = trunc i64 %a to i8
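The updated checks fold zext(shl(trunc %a to iN, k)) into a same-width shl plus an and; the mask is the iN all-ones value shifted left by k and re-truncated to N bits. The added target datalayout line marks these widths as native, which presumably is what lets the fold keep the arithmetic at the original width. Checking the two constants above (sketch only):

def shl_zext_mask(narrow_bits, shamt):
    # Bits that survive: the narrow type's bit range, shifted left by shamt
    # and re-truncated to the narrow width before the zero-extension.
    ones = (1 << narrow_bits) - 1
    return (ones << shamt) & ones

assert shl_zext_mask(16, 4) == 65520   # i16 case in trunc_shl_zext_32
assert shl_zext_mask(8, 7) == 128      # i8 case in trunc_shl_zext_64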
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index cf96469..db2c818 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -75,8 +75,8 @@ exit:
; CHECK: [[PROF_F0_ENTRY]] = !{!"branch_weights", i32 12, i32 1}
; CHECK: [[PROF_F0_UNLIKELY]] = !{!"branch_weights", i32 1, i32 127}
-; CEHCK: [[PROF_F0_VECTOR_BODY]] = !{!"branch_weights", i32 1, i32 307}
+; CHECK: [[PROF_F0_VECTOR_BODY]] = !{!"branch_weights", i32 1, i32 307}
; CHECK: [[PROF_F0_MIDDLE_BLOCKS]] = !{!"branch_weights", i32 1, i32 3}
; CHECK: [[PROF_F0_VEC_EPILOGUE_SKIP]] = !{!"branch_weights", i32 4, i32 0}
; CHECK: [[PROF_F0_VEC_EPILOG_VECTOR_BODY]] = !{!"branch_weights", i32 0, i32 0}
-; CEHCK: [[PROF_F0_LOOP]] = !{!"branch_weights", i32 2, i32 1}
+; CHECK: [[PROF_F0_LOOP]] = !{!"branch_weights", i32 2, i32 1}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/interleave-greater-than-slice.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/interleave-greater-than-slice.ll
new file mode 100644
index 0000000..4a8b851
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/interleave-greater-than-slice.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v,+zvl128b < %s | FileCheck %s
+
+define void @test(ptr %a, float %0) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[A:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 84
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP2]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 28
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP4]], float 0.000000e+00, float [[TMP3]])
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 8
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 68
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float 0.000000e+00, float [[TMP5]])
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 88
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP10]], float 0.000000e+00, float [[TMP7]])
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 92
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP12]], float 0.000000e+00, float [[TMP11]])
+; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float [[TMP9]])
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP15]], float 0.000000e+00, float [[TMP13]])
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr i8, ptr [[TMP1]], i64 80
+; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP17]], float [[TMP16]])
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[TMP1]], i64 100
+; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP19]], float 0.000000e+00, float [[TMP14]])
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], [[TMP20]]
+; CHECK-NEXT: store float [[ADD]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %1 = load ptr, ptr %a, align 8
+ %arrayidx = getelementptr i8, ptr %1, i64 84
+ %2 = load float, ptr %arrayidx, align 4
+ %3 = tail call float @llvm.fmuladd.f32(float %2, float 0.000000e+00, float 0.000000e+00)
+ %arrayidx1 = getelementptr i8, ptr %1, i64 28
+ %4 = load float, ptr %arrayidx1, align 4
+ %5 = tail call float @llvm.fmuladd.f32(float %4, float 0.000000e+00, float %3)
+ %arrayidx2 = getelementptr i8, ptr %1, i64 8
+ %6 = load float, ptr %arrayidx2, align 4
+ %7 = tail call float @llvm.fmuladd.f32(float %6, float 0.000000e+00, float 0.000000e+00)
+ %arrayidx3 = getelementptr i8, ptr %1, i64 68
+ %8 = load float, ptr %arrayidx3, align 4
+ %9 = tail call float @llvm.fmuladd.f32(float %8, float 0.000000e+00, float %5)
+ %arrayidx4 = getelementptr i8, ptr %1, i64 88
+ %10 = load float, ptr %arrayidx4, align 4
+ %11 = tail call float @llvm.fmuladd.f32(float %10, float 0.000000e+00, float %7)
+ %arrayidx5 = getelementptr i8, ptr %1, i64 92
+ %12 = load float, ptr %arrayidx5, align 4
+ %13 = tail call float @llvm.fmuladd.f32(float %12, float 0.000000e+00, float %11)
+ %14 = tail call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float %9)
+ %arrayidx6 = getelementptr i8, ptr %1, i64 96
+ %15 = load float, ptr %arrayidx6, align 4
+ %16 = tail call float @llvm.fmuladd.f32(float %15, float 0.000000e+00, float %13)
+ %arrayidx7 = getelementptr i8, ptr %1, i64 80
+ %17 = load float, ptr %arrayidx7, align 4
+ %18 = tail call float @llvm.fmuladd.f32(float %0, float %17, float %16)
+ %arrayidx8 = getelementptr i8, ptr %1, i64 100
+ %19 = load float, ptr %arrayidx8, align 4
+ %20 = tail call float @llvm.fmuladd.f32(float %19, float 0.000000e+00, float %14)
+ %add = fadd float %18, %20
+ store float %add, ptr %a, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/X86/pr114901.ll b/llvm/test/Transforms/VectorCombine/X86/pr114901.ll
new file mode 100644
index 0000000..4daa569
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/pr114901.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=AVX
+
+; PR114901 - ensure that the ASHR node doesn't commute the operands.
+define i1 @PR114901(<4 x i32> %a) {
+; SSE-LABEL: define i1 @PR114901(
+; SSE-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SSE-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SSE-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[E1]], -8
+; SSE-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[E3]], 42
+; SSE-NEXT: [[R:%.*]] = ashr i1 [[CMP3]], [[CMP1]]
+; SSE-NEXT: ret i1 [[R]]
+;
+; AVX-LABEL: define i1 @PR114901(
+; AVX-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; AVX-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; AVX-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[E1]], -8
+; AVX-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[E3]], 42
+; AVX-NEXT: [[R:%.*]] = ashr i1 [[CMP3]], [[CMP1]]
+; AVX-NEXT: ret i1 [[R]]
+;
+ %e1 = extractelement <4 x i32> %a, i32 1
+ %e3 = extractelement <4 x i32> %a, i32 3
+ %cmp1 = icmp sgt i32 %e1, 4294967288
+ %cmp3 = icmp sgt i32 %e3, 42
+ %r = ashr i1 %cmp3, %cmp1
+ ret i1 %r
+}
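
For context on why operand order matters in this fold: `ashr` is not commutative, and on `i1` a swap also changes where poison appears. A minimal illustration with constants (not part of the test; per the LangRef, a shift amount equal to or larger than the bit width yields poison):

    %x = ashr i1 1, 0   ; 1: shifting by zero is the identity
    %y = ashr i1 0, 1   ; poison: the shift amount equals the bit width

So rewriting `%r = ashr i1 %cmp3, %cmp1` into a form with the operands swapped would not be semantics-preserving, which is what the checks above pin down.
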
diff --git a/llvm/test/Verifier/recursive-struct-param.ll b/llvm/test/Verifier/recursive-struct-param.ll
deleted file mode 100644
index 19497f6..0000000
--- a/llvm/test/Verifier/recursive-struct-param.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: opt -passes=verify < %s
-
-%struct.__sFILE = type { %struct.__sFILE }
-
-@.str = private unnamed_addr constant [13 x i8] c"Hello world\0A\00", align 1
-
-; Function Attrs: nounwind ssp
-define void @test(ptr %stream, ptr %str) {
- %fputs = call i32 @fputs(ptr %str, ptr %stream)
- ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @fputs(ptr nocapture, ptr nocapture)
-
diff --git a/llvm/test/Verifier/recursive-type-1.ll b/llvm/test/Verifier/recursive-type-1.ll
deleted file mode 100644
index 4a39957..0000000
--- a/llvm/test/Verifier/recursive-type-1.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-
-%rt2 = type { i32, { i8, %rt2, i8 }, i32 }
-
-define i32 @main() nounwind {
-entry:
- ; Check that recursive types trigger an error instead of segfaulting, when
- ; the recursion isn't through a pointer to the type.
- ; CHECK: Cannot allocate unsized type
- %0 = alloca %rt2
- ret i32 0
-}
diff --git a/llvm/test/Verifier/recursive-type-2.ll b/llvm/test/Verifier/recursive-type-2.ll
deleted file mode 100644
index 5f2f66f..0000000
--- a/llvm/test/Verifier/recursive-type-2.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-
-%rt1 = type { i32, { i8, %rt2, i8 }, i32 }
-%rt2 = type { i64, { i6, %rt3 } }
-%rt3 = type { %rt1 }
-
-define i32 @main() nounwind {
-entry:
- ; Check that mutually recursive types trigger an error instead of segfaulting,
- ; when the recursion isn't through a pointer to the type.
- ; CHECK: Cannot allocate unsized type
- %0 = alloca %rt2
- ret i32 0
-}
diff --git a/llvm/test/Verifier/recursive-type-load.ll b/llvm/test/Verifier/recursive-type-load.ll
deleted file mode 100644
index 62a094d..0000000
--- a/llvm/test/Verifier/recursive-type-load.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-
-%rt2 = type { i32, { i8, %rt2, i8 }, i32 }
-
-define i32 @f(ptr %p) nounwind {
-entry:
- ; Check that recursive types trigger an error instead of segfaulting, when
- ; the recursion isn't through a pointer to the type.
- ; CHECK: loading unsized types is not allowed
- %0 = load %rt2, ptr %p
- ret i32 %0
-}
diff --git a/llvm/test/Verifier/recursive-type-store.ll b/llvm/test/Verifier/recursive-type-store.ll
deleted file mode 100644
index ed815f8..0000000
--- a/llvm/test/Verifier/recursive-type-store.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-
-%rt2 = type { i32, { i8, %rt2, i8 }, i32 }
-
-define void @f(%rt2 %r, ptr %p) nounwind {
-entry:
- ; Check that recursive types trigger an error instead of segfaulting, when
- ; the recursion isn't through a pointer to the type.
- ; CHECK: storing unsized types is not allowed
- store %rt2 %r, ptr %p
- ret void
-}
diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp
index e1a732f4..7dbbf73 100644
--- a/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -245,7 +245,7 @@ static void RemoveFunctionReferences(Module *M, const char *Name) {
auto *NewValElemTy = OldUsedVal->getType()->getElementType();
auto *NewValTy = ArrayType::get(NewValElemTy, Used.size());
auto *NewUsedVal = ConstantArray::get(NewValTy, Used);
- UsedVar->mutateType(NewUsedVal->getType()->getPointerTo());
+ UsedVar->mutateType(PointerType::getUnqual(M->getContext()));
UsedVar->setInitializer(NewUsedVal);
}
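
The bugpoint change above fits the opaque-pointer model: every pointer in a given address space is the single `ptr` type, so the new type can be taken straight from the context instead of being derived from the initializer's pointee type. A minimal sketch, assuming an `llvm::LLVMContext &Ctx` is in scope:

    // With opaque pointers this is simply 'ptr' (address space 0), regardless
    // of the type of the value it will eventually point at.
    llvm::PointerType *PtrTy = llvm::PointerType::getUnqual(Ctx);
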
diff --git a/llvm/unittests/ADT/FunctionRefTest.cpp b/llvm/unittests/ADT/FunctionRefTest.cpp
index b181933..6ad4040 100644
--- a/llvm/unittests/ADT/FunctionRefTest.cpp
+++ b/llvm/unittests/ADT/FunctionRefTest.cpp
@@ -60,11 +60,11 @@ TEST(FunctionRefTest, SFINAE) {
}
TEST(FunctionRefTest, Equality) {
- function_ref<int()> X = [] { return 1; };
+ const auto Lambda = []() { return 0; };
+ function_ref<int()> X = Lambda;
function_ref<int()> Y = X;
EXPECT_EQ(X, Y);
- const auto Lambda = []() { return 0; };
function_ref<int()> A(Lambda), B(Lambda);
EXPECT_EQ(A, B);
}
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index 982d00c..c5e1d45 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -59,7 +59,7 @@ protected:
// Check that we don't accept egregiously incorrect prototypes.
TEST_F(TargetLibraryInfoTest, InvalidProto) {
- parseAssembly("%foo = type { %foo }\n");
+ parseAssembly("%foo = type opaque\n");
auto *StructTy = StructType::getTypeByName(Context, "foo");
auto *InvalidFTy = FunctionType::get(StructTy, /*isVarArg=*/false);
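
The replacement keeps the test's premise intact: the check only needs a named struct with no usable size to build an egregiously incorrect prototype. A directly self-recursive struct has no computable size (the recursive-type tests deleted above expect "Cannot allocate unsized type" for exactly this shape), and an opaque struct is likewise unsized:

    %foo = type { %foo }   ; old form: unsized because the struct contains itself directly
    %foo = type opaque     ; new form: also unsized, without the self-referential definition
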
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index b16368d..ce0bf86 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -383,26 +383,22 @@ TEST(KnownBitsTest, BinaryExhaustive) {
"sadd_sat", KnownBits::sadd_sat,
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
return N1.sadd_sat(N2);
- },
- /*CheckOptimality=*/false);
+ });
testBinaryOpExhaustive(
"uadd_sat", KnownBits::uadd_sat,
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
return N1.uadd_sat(N2);
- },
- /*CheckOptimality=*/false);
+ });
testBinaryOpExhaustive(
"ssub_sat", KnownBits::ssub_sat,
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
return N1.ssub_sat(N2);
- },
- /*CheckOptimality=*/false);
+ });
testBinaryOpExhaustive(
"usub_sat", KnownBits::usub_sat,
[](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
return N1.usub_sat(N2);
- },
- /*CheckOptimality=*/false);
+ });
testBinaryOpExhaustive(
"shl",
[](const KnownBits &Known1, const KnownBits &Known2) {
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index e9480d3..92fd6e9 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -1113,4 +1113,73 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index",
let hasCanonicalizer = 1;
}
+//===----------------------------------------------------------------------===//
+// AffineLinearizeIndexOp
+//===----------------------------------------------------------------------===//
+def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
+ [Pure, AttrSizedOperandSegments]> {
+ let summary = "linearize an index";
+ let description = [{
+ The `affine.linearize_index` operation takes a sequence of index values and a
+ basis of the same length and linearizes the indices using that basis.
+
+    That is, for indices `%idx_1` through `%idx_N` and basis elements `B_1` through `B_N`,
+ it computes
+
+ ```
+ sum(i = 1 to N) %idx_i * product(j = i + 1 to N) B_j
+ ```
+
+ If the `disjoint` property is present, this is an optimization hint that,
+ for all `i`, `0 <= %idx_i < B_i` - that is, no index affects any other index,
+    except that `%idx_1` may be negative to make the index as a whole negative.
+
+ Note that the outputs of `affine.delinearize_index` are, by definition, `disjoint`.
+
+ Example:
+
+ ```mlir
+    %linear_index = affine.linearize_index [%index_0, %index_1, %index_2] by (2, 3, 5) : index
+ ```
+
+ In the above example, `%linear_index` conceptually holds the following:
+
+ ```mlir
+ #map = affine_map<()[s0, s1, s2] -> (s0 * 15 + s1 * 5 + s2)>
+ %linear_index = affine.apply #map()[%index_0, %index_1, %index_2]
+ ```
+ }];
+
+ let arguments = (ins Variadic<Index>:$multi_index,
+ Variadic<Index>:$dynamic_basis,
+ DenseI64ArrayAttr:$static_basis,
+ UnitProperty:$disjoint);
+ let results = (outs Index:$linear_index);
+
+ let assemblyFormat = [{
+ (`disjoint` $disjoint^)? ` `
+ `[` $multi_index `]` `by` ` `
+ custom<DynamicIndexList>($dynamic_basis, $static_basis, "::mlir::AsmParser::Delimiter::Paren")
+ attr-dict `:` type($linear_index)
+ }];
+
+ let builders = [
+ OpBuilder<(ins "ValueRange":$multi_index, "ValueRange":$basis, CArg<"bool", "false">:$disjoint)>,
+ OpBuilder<(ins "ValueRange":$multi_index, "ArrayRef<OpFoldResult>":$basis, CArg<"bool", "false">:$disjoint)>,
+ OpBuilder<(ins "ValueRange":$multi_index, "ArrayRef<int64_t>":$basis, CArg<"bool", "false">:$disjoint)>
+ ];
+
+ let extraClassDeclaration = [{
+ /// Return a vector with all the static and dynamic basis values.
+ SmallVector<OpFoldResult> getMixedBasis() {
+ OpBuilder builder(getContext());
+ return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
+ }
+
+ }];
+
+ let hasVerifier = 1;
+ let hasCanonicalizer = 1;
+}
+
#endif // AFFINE_OPS
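
As a quick check of the formula in the description above, the basis `(2, 3, 5)` yields strides `(3*5, 5, 1) = (15, 5, 1)`, which is exactly the `affine_map` shown there. A throwaway C++ helper (purely illustrative) computing the same value:

    // Linearize (i0, i1, i2) against basis (2, 3, 5); strides are (15, 5, 1).
    int64_t linearizeExample(int64_t i0, int64_t i1, int64_t i2) {
      return i0 * 15 + i1 * 5 + i2; // e.g. (1, 2, 3) -> 28
    }
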
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index a2bf923..0e98223 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -315,12 +315,17 @@ FailureOr<SmallVector<Value>> delinearizeIndex(OpBuilder &b, Location loc,
FailureOr<SmallVector<Value>> delinearizeIndex(OpBuilder &b, Location loc,
Value linearIndex,
ArrayRef<OpFoldResult> basis);
+
// Generate IR that extracts the linear index from a multi-index according to
// a basis/shape.
OpFoldResult linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
ArrayRef<OpFoldResult> basis,
ImplicitLocOpBuilder &builder);
+OpFoldResult linearizeIndex(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> multiIndex,
+ ArrayRef<OpFoldResult> basis);
+
/// Ensure that all operations that could be executed after `start`
/// (noninclusive) and prior to `memOp` (e.g. on a control flow/op path
/// between the operations) do not have the potential memory effect
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
index e866ac5..58dce89 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -77,9 +77,10 @@ void populateUnsignedWhenEquivalentPatterns(RewritePatternSet &patterns,
/// Create a pass which do optimizations based on integer range analysis.
std::unique_ptr<Pass> createIntRangeOptimizationsPass();
-/// Add patterns for integer bitwidth narrowing.
-void populateArithIntNarrowingPatterns(RewritePatternSet &patterns,
- const ArithIntNarrowingOptions &options);
+/// Add patterns for int range based narrowing.
+void populateIntRangeNarrowingPatterns(RewritePatternSet &patterns,
+ DataFlowSolver &solver,
+ ArrayRef<unsigned> bitwidthsSupported);
//===----------------------------------------------------------------------===//
// Registration
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
index 1517f71..1d37314 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -50,6 +50,28 @@ def ArithIntRangeOpts : Pass<"int-range-optimizations"> {
];
}
+def ArithIntRangeNarrowing : Pass<"arith-int-range-narrowing"> {
+  let summary = "Reduce the bitwidth of integer operations based on integer range analysis";
+ let description = [{
+ This pass runs integer range analysis and tries to narrow arith ops to the
+ specified bitwidth based on its results.
+
+    The `bitwidthsSupported` values are assumed to be no wider than the `index` type.
+ TODO: get index width from DLTI.
+ }];
+
+ let options = [
+ ListOption<"bitwidthsSupported", "int-bitwidths-supported", "unsigned",
+ "Integer bitwidths supported">,
+ ];
+
+ // Explicitly depend on "arith" because this pass could create operations in
+ // `arith` out of thin air in some cases.
+ let dependentDialects = [
+ "::mlir::arith::ArithDialect"
+ ];
+}
+
def ArithEmulateUnsupportedFloats : Pass<"arith-emulate-unsupported-floats"> {
let summary = "Emulate operations on unsupported floats with extf/truncf";
let description = [{
@@ -92,18 +114,4 @@ def ArithEmulateWideInt : Pass<"arith-emulate-wide-int"> {
let dependentDialects = ["vector::VectorDialect"];
}
-def ArithIntNarrowing : Pass<"arith-int-narrowing"> {
- let summary = "Reduce integer operation bitwidth";
- let description = [{
- Reduce bitwidths of integer types used in arith operations. This pass
- prefers the narrowest available integer bitwidths that are guaranteed to
- produce the same results.
- }];
- let dependentDialects = ["vector::VectorDialect"];
- let options = [
- ListOption<"bitwidthsSupported", "int-bitwidths-supported", "unsigned",
- "Integer bitwidths supported">,
- ];
- }
-
#endif // MLIR_DIALECT_ARITH_TRANSFORMS_PASSES
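
The corresponding entry point declared in Passes.h above can also be used directly. A rough sketch, assuming an `MLIRContext *ctx` and a `DataFlowSolver` named `solver` that has already been configured with (and run) integer range analysis:

    // Collect the narrowing patterns, allowing only 32-bit results.
    mlir::RewritePatternSet patterns(ctx);
    mlir::arith::populateIntRangeNarrowingPatterns(patterns, solver,
                                                   /*bitwidthsSupported=*/{32});

From the command line the pass would be driven as something like `mlir-opt -arith-int-range-narrowing="int-bitwidths-supported=32"`, with the option name taken from the `ListOption` declaration above (the exact invocation syntax is an assumption).
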
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
index 72abb5b..ab9a48f 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
@@ -163,7 +163,7 @@ struct BufferResultsToOutParamsOpts {
// Filter function; returns true if the function should be converted.
// Defaults to true, i.e. all functions are converted.
- llvm::function_ref<bool(func::FuncOp *)> filterFn = [](func::FuncOp *func) {
+ std::function<bool(func::FuncOp *)> filterFn = [](func::FuncOp *func) {
return true;
};
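
The practical difference here is ownership: `llvm::function_ref` is a non-owning view of a callable, so default-initializing the member from a lambda would leave it referring to a destroyed temporary, while `std::function` stores its own copy. A minimal sketch of the pitfall (illustrative only, not code from this patch):

    // Dangles: the temporary lambda dies at the end of this statement.
    llvm::function_ref<bool(int)> view = [](int) { return true; };
    // Fine: std::function owns a copy of the callable.
    std::function<bool(int)> owned = [](int) { return true; };
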
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index e305e2f..8d7e274 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -347,6 +347,24 @@ def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
}];
let hasVerifier = 1;
+
+ let builders = [
+ OpBuilder<(ins "::mlir::Value":$extent), [{
+ build($_builder, $_state,
+ ::mlir::acc::DataBoundsType::get($_builder.getContext()),
+ /*lowerbound=*/{}, /*upperbound=*/{}, extent,
+ /*stride=*/{}, /*strideInBytes=*/nullptr, /*startIdx=*/{});
+ }]
+ >,
+ OpBuilder<(ins "::mlir::Value":$lowerbound,
+ "::mlir::Value":$upperbound), [{
+ build($_builder, $_state,
+ ::mlir::acc::DataBoundsType::get($_builder.getContext()),
+ lowerbound, upperbound, /*extent=*/{},
+ /*stride=*/{}, /*strideInBytes=*/nullptr, /*startIdx=*/{});
+ }]
+ >
+ ];
}
// Data entry operation does not refer to OpenACC spec terminology, but to
@@ -450,6 +468,33 @@ class OpenACC_DataEntryOp<string mnemonic, string clause, string extraDescriptio
}];
let hasVerifier = 1;
+
+ let builders = [
+ OpBuilder<(ins "::mlir::Value":$varPtr,
+ "bool":$structured,
+ "bool":$implicit,
+ CArg<"::mlir::ValueRange", "{}">:$bounds), [{
+ build($_builder, $_state, varPtr.getType(), varPtr, /*varPtrPtr=*/{},
+ bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr,
+ /*asyncOnly=*/nullptr, /*dataClause=*/nullptr,
+ /*structured=*/$_builder.getBoolAttr(structured),
+ /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr);
+ }]
+ >,
+ OpBuilder<(ins "::mlir::Value":$varPtr,
+ "bool":$structured,
+ "bool":$implicit,
+ "const ::llvm::Twine &":$name,
+ CArg<"::mlir::ValueRange", "{}">:$bounds), [{
+ build($_builder, $_state, varPtr.getType(), varPtr, /*varPtrPtr=*/{},
+ bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr,
+ /*asyncOnly=*/nullptr, /*dataClause=*/nullptr,
+ /*structured=*/$_builder.getBoolAttr(structured),
+ /*implicit=*/$_builder.getBoolAttr(implicit),
+ /*name=*/$_builder.getStringAttr(name));
+ }]
+ >
+ ];
}
//===----------------------------------------------------------------------===//
@@ -762,23 +807,13 @@ class OpenACC_DataExitOp<string mnemonic, string clause, string extraDescription
let hasVerifier = 1;
}
-//===----------------------------------------------------------------------===//
-// 2.7.8 copyout clause
-//===----------------------------------------------------------------------===//
-def OpenACC_CopyoutOp : OpenACC_DataExitOp<"copyout",
- "mlir::acc::DataClause::acc_copyout",
- "- `varPtr`: The address of variable to copy back to.",
- [MemoryEffects<[MemRead<OpenACC_RuntimeCounters>,
- MemWrite<OpenACC_RuntimeCounters>]>],
- (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr,
- Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemWrite]>:$varPtr)> {
- let summary = "Represents acc copyout semantics - reverse of copyin.";
-
- let extraClassDeclaration = extraClassDeclarationBase # [{
- /// Check if this is a copyout with zero modifier.
- bool isCopyoutZero();
- }];
-
+class OpenACC_DataExitOpWithVarPtr<string mnemonic, string clause> :
+ OpenACC_DataExitOp<mnemonic, clause,
+ "- `varPtr`: The address of variable to copy back to.",
+ [MemoryEffects<[MemRead<OpenACC_RuntimeCounters>,
+ MemWrite<OpenACC_RuntimeCounters>]>],
+ (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr,
+ Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemWrite]>:$varPtr)> {
let assemblyFormat = [{
`accPtr` `(` $accPtr `:` type($accPtr) `)`
(`bounds` `(` $bounds^ `)` )?
@@ -787,20 +822,42 @@ def OpenACC_CopyoutOp : OpenACC_DataExitOp<"copyout",
`to` `varPtr` `(` $varPtr `:` type($varPtr) `)`
attr-dict
}];
+
+ let builders = [
+ OpBuilder<(ins "::mlir::Value":$accPtr,
+ "::mlir::Value":$varPtr,
+ "bool":$structured,
+ "bool":$implicit,
+ CArg<"::mlir::ValueRange", "{}">:$bounds), [{
+ build($_builder, $_state, accPtr, varPtr,
+ bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr,
+ /*asyncOnly=*/nullptr, /*dataClause=*/nullptr,
+ /*structured=*/$_builder.getBoolAttr(structured),
+ /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr);
+ }]
+ >,
+ OpBuilder<(ins "::mlir::Value":$accPtr,
+ "::mlir::Value":$varPtr,
+ "bool":$structured,
+ "bool":$implicit,
+ "const ::llvm::Twine &":$name,
+ CArg<"::mlir::ValueRange", "{}">:$bounds), [{
+ build($_builder, $_state, accPtr, varPtr,
+ bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr,
+ /*asyncOnly=*/nullptr, /*dataClause=*/nullptr,
+ /*structured=*/$_builder.getBoolAttr(structured),
+ /*implicit=*/$_builder.getBoolAttr(implicit),
+ /*name=*/$_builder.getStringAttr(name));
+ }]
+ >
+ ];
}
-//===----------------------------------------------------------------------===//
-// 2.7.11 delete clause
-//===----------------------------------------------------------------------===//
-def OpenACC_DeleteOp : OpenACC_DataExitOp<"delete",
- "mlir::acc::DataClause::acc_delete", "",
- [MemoryEffects<[MemRead<OpenACC_RuntimeCounters>,
+class OpenACC_DataExitOpNoVarPtr<string mnemonic, string clause> :
+ OpenACC_DataExitOp<mnemonic, clause, "",
+ [MemoryEffects<[MemRead<OpenACC_RuntimeCounters>,
MemWrite<OpenACC_RuntimeCounters>]>],
- (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr)> {
- let summary = "Represents acc delete semantics - reverse of create.";
-
- let extraClassDeclaration = extraClassDeclarationBase;
-
+ (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr)> {
let assemblyFormat = [{
`accPtr` `(` $accPtr `:` type($accPtr) `)`
(`bounds` `(` $bounds^ `)` )?
@@ -808,39 +865,71 @@ def OpenACC_DeleteOp : OpenACC_DataExitOp<"delete",
type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
attr-dict
}];
+
+ let builders = [
+ OpBuilder<(ins "::mlir::Value":$accPtr,
+ "bool":$structured,
+ "bool":$implicit,
+ CArg<"::mlir::ValueRange", "{}">:$bounds), [{
+ build($_builder, $_state, accPtr,
+ bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr,
+ /*asyncOnly=*/nullptr, /*dataClause=*/nullptr,
+ /*structured=*/$_builder.getBoolAttr(structured),
+ /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr);
+ }]
+ >,
+ OpBuilder<(ins "::mlir::Value":$accPtr,
+ "bool":$structured,
+ "bool":$implicit,
+ "const ::llvm::Twine &":$name,
+ CArg<"::mlir::ValueRange", "{}">:$bounds), [{
+ build($_builder, $_state, accPtr,
+ bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr,
+ /*asyncOnly=*/nullptr, /*dataClause=*/nullptr,
+ /*structured=*/$_builder.getBoolAttr(structured),
+ /*implicit=*/$_builder.getBoolAttr(implicit),
+ /*name=*/$_builder.getStringAttr(name));
+ }]
+ >
+ ];
}
//===----------------------------------------------------------------------===//
-// 2.7.13 detach clause
+// 2.7.8 copyout clause
//===----------------------------------------------------------------------===//
-def OpenACC_DetachOp : OpenACC_DataExitOp<"detach",
- "mlir::acc::DataClause::acc_detach", "",
- [MemoryEffects<[MemRead<OpenACC_RuntimeCounters>,
- MemWrite<OpenACC_RuntimeCounters>]>],
- (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr)> {
- let summary = "Represents acc detach semantics - reverse of attach.";
+def OpenACC_CopyoutOp : OpenACC_DataExitOpWithVarPtr<"copyout",
+ "mlir::acc::DataClause::acc_copyout"> {
+ let summary = "Represents acc copyout semantics - reverse of copyin.";
+ let extraClassDeclaration = extraClassDeclarationBase # [{
+ /// Check if this is a copyout with zero modifier.
+ bool isCopyoutZero();
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// 2.7.11 delete clause
+//===----------------------------------------------------------------------===//
+def OpenACC_DeleteOp : OpenACC_DataExitOpNoVarPtr<"delete",
+ "mlir::acc::DataClause::acc_delete"> {
+ let summary = "Represents acc delete semantics - reverse of create.";
let extraClassDeclaration = extraClassDeclarationBase;
+}
- let assemblyFormat = [{
- `accPtr` `(` $accPtr `:` type($accPtr) `)`
- (`bounds` `(` $bounds^ `)` )?
- (`async` `(` custom<DeviceTypeOperands>($asyncOperands,
- type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
- attr-dict
- }];
+//===----------------------------------------------------------------------===//
+// 2.7.13 detach clause
+//===----------------------------------------------------------------------===//
+def OpenACC_DetachOp : OpenACC_DataExitOpNoVarPtr<"detach",
+ "mlir::acc::DataClause::acc_detach"> {
+ let summary = "Represents acc detach semantics - reverse of attach.";
+ let extraClassDeclaration = extraClassDeclarationBase;
}
//===----------------------------------------------------------------------===//
// 2.14.4 host clause
//===----------------------------------------------------------------------===//
-def OpenACC_UpdateHostOp : OpenACC_DataExitOp<"update_host",
- "mlir::acc::DataClause::acc_update_host",
- "- `varPtr`: The address of variable to copy back to.",
- [MemoryEffects<[MemRead<OpenACC_RuntimeCounters>,
- MemWrite<OpenACC_RuntimeCounters>]>],
- (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr,
- Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemWrite]>:$varPtr)> {
+def OpenACC_UpdateHostOp : OpenACC_DataExitOpWithVarPtr<"update_host",
+ "mlir::acc::DataClause::acc_update_host"> {
let summary = "Represents acc update host semantics.";
let extraClassDeclaration = extraClassDeclarationBase # [{
/// Check if this is an acc update self.
@@ -848,15 +937,6 @@ def OpenACC_UpdateHostOp : OpenACC_DataExitOp<"update_host",
return getDataClause() == acc::DataClause::acc_update_self;
}
}];
-
- let assemblyFormat = [{
- `accPtr` `(` $accPtr `:` type($accPtr) `)`
- (`bounds` `(` $bounds^ `)` )?
- (`async` `(` custom<DeviceTypeOperands>($asyncOperands,
- type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
- `to` `varPtr` `(` $varPtr `:` type($varPtr) `)`
- attr-dict
- }];
}
//===----------------------------------------------------------------------===//
@@ -1958,8 +2038,7 @@ def OpenACC_YieldOp : OpenACC_Op<"yield", [Pure, ReturnLike, Terminator,
// 2.12 atomic construct
//===----------------------------------------------------------------------===//
-def AtomicReadOp : OpenACC_Op<"atomic.read", [AllTypesMatch<["x", "v"]>,
- AtomicReadOpInterface]> {
+def AtomicReadOp : OpenACC_Op<"atomic.read", [AtomicReadOpInterface]> {
let summary = "performs an atomic read";
@@ -1975,7 +2054,7 @@ def AtomicReadOp : OpenACC_Op<"atomic.read", [AllTypesMatch<["x", "v"]>,
TypeAttr:$element_type);
let assemblyFormat = [{
$v `=` $x
- `:` type($x) `,` $element_type attr-dict
+ `:` type($v) `,` type($x) `,` $element_type attr-dict
}];
let hasVerifier = 1;
}
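
With `AllTypesMatch<["x", "v"]>` dropped, `x` and `v` may now have different types, so the assembly prints both instead of one shared type. Roughly, the printed form changes along these lines (types chosen only for illustration):

    // Old form: a single type stood for both operands.
    acc.atomic.read %v = %x : memref<i32>, i32
    // New form: type($v) and type($x) are spelled out separately.
    acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
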
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 626539c..5fd8184 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -1286,7 +1286,7 @@ def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [
// two-step process.
def AtomicReadOp : OpenMP_Op<"atomic.read", traits = [
- AllTypesMatch<["x", "v"]>, AtomicReadOpInterface
+ AtomicReadOpInterface
], clauses = [
OpenMP_HintClause, OpenMP_MemoryOrderClause
]> {
@@ -1304,7 +1304,8 @@ def AtomicReadOp : OpenMP_Op<"atomic.read", traits = [
// Override clause-based assemblyFormat.
let assemblyFormat = "$v `=` $x" # clausesReqAssemblyFormat # " oilist(" #
- clausesOptAssemblyFormat # ") `:` type($x) `,` $element_type attr-dict";
+ clausesOptAssemblyFormat #
+ ") `:` type($v) `,` type($x) `,` $element_type attr-dict";
let extraClassDeclaration = [{
/// The number of variable operands.
diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp
index c8883c0..ead81a7 100644
--- a/mlir/lib/Bindings/Python/IRAttributes.cpp
+++ b/mlir/lib/Bindings/Python/IRAttributes.cpp
@@ -13,7 +13,6 @@
#include "IRModule.h"
#include "PybindUtils.h"
-#include <pybind11/numpy.h>
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Support/raw_ostream.h"
@@ -758,10 +757,103 @@ public:
throw py::error_already_set();
}
auto freeBuffer = llvm::make_scope_exit([&]() { PyBuffer_Release(&view); });
+ SmallVector<int64_t> shape;
+ if (explicitShape) {
+ shape.append(explicitShape->begin(), explicitShape->end());
+ } else {
+ shape.append(view.shape, view.shape + view.ndim);
+ }
+ MlirAttribute encodingAttr = mlirAttributeGetNull();
MlirContext context = contextWrapper->get();
- MlirAttribute attr = getAttributeFromBuffer(view, signless, explicitType,
- explicitShape, context);
+
+ // Detect format codes that are suitable for bulk loading. This includes
+ // all byte aligned integer and floating point types up to 8 bytes.
+ // Notably, this excludes, bool (which needs to be bit-packed) and
+    // Notably, this excludes bool (which needs to be bit-packed) and other
+    // exotic types which do not have a direct representation in the buffer
+    // protocol (e.g. complex).
+ if (explicitType) {
+ bulkLoadElementType = *explicitType;
+ } else {
+ std::string_view format(view.format);
+ if (format == "f") {
+ // f32
+ assert(view.itemsize == 4 && "mismatched array itemsize");
+ bulkLoadElementType = mlirF32TypeGet(context);
+ } else if (format == "d") {
+ // f64
+ assert(view.itemsize == 8 && "mismatched array itemsize");
+ bulkLoadElementType = mlirF64TypeGet(context);
+ } else if (format == "e") {
+ // f16
+ assert(view.itemsize == 2 && "mismatched array itemsize");
+ bulkLoadElementType = mlirF16TypeGet(context);
+ } else if (isSignedIntegerFormat(format)) {
+ if (view.itemsize == 4) {
+ // i32
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 32)
+ : mlirIntegerTypeSignedGet(context, 32);
+ } else if (view.itemsize == 8) {
+ // i64
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 64)
+ : mlirIntegerTypeSignedGet(context, 64);
+ } else if (view.itemsize == 1) {
+ // i8
+ bulkLoadElementType = signless ? mlirIntegerTypeGet(context, 8)
+ : mlirIntegerTypeSignedGet(context, 8);
+ } else if (view.itemsize == 2) {
+ // i16
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 16)
+ : mlirIntegerTypeSignedGet(context, 16);
+ }
+ } else if (isUnsignedIntegerFormat(format)) {
+ if (view.itemsize == 4) {
+ // unsigned i32
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 32)
+ : mlirIntegerTypeUnsignedGet(context, 32);
+ } else if (view.itemsize == 8) {
+ // unsigned i64
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 64)
+ : mlirIntegerTypeUnsignedGet(context, 64);
+ } else if (view.itemsize == 1) {
+ // i8
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 8)
+ : mlirIntegerTypeUnsignedGet(context, 8);
+ } else if (view.itemsize == 2) {
+ // i16
+ bulkLoadElementType = signless
+ ? mlirIntegerTypeGet(context, 16)
+ : mlirIntegerTypeUnsignedGet(context, 16);
+ }
+ }
+ if (!bulkLoadElementType) {
+ throw std::invalid_argument(
+ std::string("unimplemented array format conversion from format: ") +
+ std::string(format));
+ }
+ }
+
+ MlirType shapedType;
+ if (mlirTypeIsAShaped(*bulkLoadElementType)) {
+ if (explicitShape) {
+ throw std::invalid_argument("Shape can only be specified explicitly "
+ "when the type is not a shaped type.");
+ }
+ shapedType = *bulkLoadElementType;
+ } else {
+ shapedType = mlirRankedTensorTypeGet(shape.size(), shape.data(),
+ *bulkLoadElementType, encodingAttr);
+ }
+ size_t rawBufferSize = view.len;
+ MlirAttribute attr =
+ mlirDenseElementsAttrRawBufferGet(shapedType, rawBufferSize, view.buf);
if (mlirAttributeIsNull(attr)) {
throw std::invalid_argument(
"DenseElementsAttr could not be constructed from the given buffer. "
@@ -871,13 +963,6 @@ public:
// unsigned i16
return bufferInfo<uint16_t>(shapedType);
}
- } else if (mlirTypeIsAInteger(elementType) &&
- mlirIntegerTypeGetWidth(elementType) == 1) {
- // i1 / bool
- // We can not send the buffer directly back to Python, because the i1
- // values are bitpacked within MLIR. We call numpy's unpackbits function
- // to convert the bytes.
- return getBooleanBufferFromBitpackedAttribute();
}
// TODO: Currently crashes the program.
@@ -931,183 +1016,14 @@ private:
code == 'q';
}
- static MlirType
- getShapedType(std::optional<MlirType> bulkLoadElementType,
- std::optional<std::vector<int64_t>> explicitShape,
- Py_buffer &view) {
- SmallVector<int64_t> shape;
- if (explicitShape) {
- shape.append(explicitShape->begin(), explicitShape->end());
- } else {
- shape.append(view.shape, view.shape + view.ndim);
- }
-
- if (mlirTypeIsAShaped(*bulkLoadElementType)) {
- if (explicitShape) {
- throw std::invalid_argument("Shape can only be specified explicitly "
- "when the type is not a shaped type.");
- }
- return *bulkLoadElementType;
- } else {
- MlirAttribute encodingAttr = mlirAttributeGetNull();
- return mlirRankedTensorTypeGet(shape.size(), shape.data(),
- *bulkLoadElementType, encodingAttr);
- }
- }
-
- static MlirAttribute getAttributeFromBuffer(
- Py_buffer &view, bool signless, std::optional<PyType> explicitType,
- std::optional<std::vector<int64_t>> explicitShape, MlirContext &context) {
- // Detect format codes that are suitable for bulk loading. This includes
- // all byte aligned integer and floating point types up to 8 bytes.
- // Notably, this excludes exotics types which do not have a direct
- // representation in the buffer protocol (i.e. complex, etc).
- std::optional<MlirType> bulkLoadElementType;
- if (explicitType) {
- bulkLoadElementType = *explicitType;
- } else {
- std::string_view format(view.format);
- if (format == "f") {
- // f32
- assert(view.itemsize == 4 && "mismatched array itemsize");
- bulkLoadElementType = mlirF32TypeGet(context);
- } else if (format == "d") {
- // f64
- assert(view.itemsize == 8 && "mismatched array itemsize");
- bulkLoadElementType = mlirF64TypeGet(context);
- } else if (format == "e") {
- // f16
- assert(view.itemsize == 2 && "mismatched array itemsize");
- bulkLoadElementType = mlirF16TypeGet(context);
- } else if (format == "?") {
- // i1
- // The i1 type needs to be bit-packed, so we will handle it seperately
- return getBitpackedAttributeFromBooleanBuffer(view, explicitShape,
- context);
- } else if (isSignedIntegerFormat(format)) {
- if (view.itemsize == 4) {
- // i32
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 32)
- : mlirIntegerTypeSignedGet(context, 32);
- } else if (view.itemsize == 8) {
- // i64
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 64)
- : mlirIntegerTypeSignedGet(context, 64);
- } else if (view.itemsize == 1) {
- // i8
- bulkLoadElementType = signless ? mlirIntegerTypeGet(context, 8)
- : mlirIntegerTypeSignedGet(context, 8);
- } else if (view.itemsize == 2) {
- // i16
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 16)
- : mlirIntegerTypeSignedGet(context, 16);
- }
- } else if (isUnsignedIntegerFormat(format)) {
- if (view.itemsize == 4) {
- // unsigned i32
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 32)
- : mlirIntegerTypeUnsignedGet(context, 32);
- } else if (view.itemsize == 8) {
- // unsigned i64
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 64)
- : mlirIntegerTypeUnsignedGet(context, 64);
- } else if (view.itemsize == 1) {
- // i8
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 8)
- : mlirIntegerTypeUnsignedGet(context, 8);
- } else if (view.itemsize == 2) {
- // i16
- bulkLoadElementType = signless
- ? mlirIntegerTypeGet(context, 16)
- : mlirIntegerTypeUnsignedGet(context, 16);
- }
- }
- if (!bulkLoadElementType) {
- throw std::invalid_argument(
- std::string("unimplemented array format conversion from format: ") +
- std::string(format));
- }
- }
-
- MlirType type = getShapedType(bulkLoadElementType, explicitShape, view);
- return mlirDenseElementsAttrRawBufferGet(type, view.len, view.buf);
- }
-
- // There is a complication for boolean numpy arrays, as numpy represents them
- // as 8 bits (1 byte) per boolean, whereas MLIR bitpacks them into 8 booleans
- // per byte.
- static MlirAttribute getBitpackedAttributeFromBooleanBuffer(
- Py_buffer &view, std::optional<std::vector<int64_t>> explicitShape,
- MlirContext &context) {
- if (llvm::endianness::native != llvm::endianness::little) {
- // Given we have no good way of testing the behavior on big-endian systems
- // we will throw
- throw py::type_error("Constructing a bit-packed MLIR attribute is "
- "unsupported on big-endian systems");
- }
-
- py::array_t<uint8_t> unpackedArray(view.len,
- static_cast<uint8_t *>(view.buf));
-
- py::module numpy = py::module::import("numpy");
- py::object packbits_func = numpy.attr("packbits");
- py::object packed_booleans =
- packbits_func(unpackedArray, "bitorder"_a = "little");
- py::buffer_info pythonBuffer = packed_booleans.cast<py::buffer>().request();
-
- MlirType bitpackedType =
- getShapedType(mlirIntegerTypeGet(context, 1), explicitShape, view);
- return mlirDenseElementsAttrRawBufferGet(bitpackedType, pythonBuffer.size,
- pythonBuffer.ptr);
- }
-
- // This does the opposite transformation of
- // `getBitpackedAttributeFromBooleanBuffer`
- py::buffer_info getBooleanBufferFromBitpackedAttribute() {
- if (llvm::endianness::native != llvm::endianness::little) {
- // Given we have no good way of testing the behavior on big-endian systems
- // we will throw
- throw py::type_error("Constructing a numpy array from a MLIR attribute "
- "is unsupported on big-endian systems");
- }
-
- int64_t numBooleans = mlirElementsAttrGetNumElements(*this);
- int64_t numBitpackedBytes = llvm::divideCeil(numBooleans, 8);
- uint8_t *bitpackedData = static_cast<uint8_t *>(
- const_cast<void *>(mlirDenseElementsAttrGetRawData(*this)));
- py::array_t<uint8_t> packedArray(numBitpackedBytes, bitpackedData);
-
- py::module numpy = py::module::import("numpy");
- py::object unpackbits_func = numpy.attr("unpackbits");
- py::object unpacked_booleans =
- unpackbits_func(packedArray, "bitorder"_a = "little");
- py::buffer_info pythonBuffer =
- unpacked_booleans.cast<py::buffer>().request();
-
- MlirType shapedType = mlirAttributeGetType(*this);
- return bufferInfo<bool>(shapedType, (bool *)pythonBuffer.ptr, "?");
- }
-
template <typename Type>
py::buffer_info bufferInfo(MlirType shapedType,
const char *explicitFormat = nullptr) {
+ intptr_t rank = mlirShapedTypeGetRank(shapedType);
// Prepare the data for the buffer_info.
- // Buffer is configured for read-only access inside the `bufferInfo` call.
+ // Buffer is configured for read-only access below.
Type *data = static_cast<Type *>(
const_cast<void *>(mlirDenseElementsAttrGetRawData(*this)));
- return bufferInfo<Type>(shapedType, data, explicitFormat);
- }
-
- template <typename Type>
- py::buffer_info bufferInfo(MlirType shapedType, Type *data,
- const char *explicitFormat = nullptr) {
- intptr_t rank = mlirShapedTypeGetRank(shapedType);
// Prepare the shape for the buffer_info.
SmallVector<intptr_t, 4> shape;
for (intptr_t i = 0; i < rank; ++i)
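
The inlined bulk-load path above is what backs `DenseElementsAttr.get` when it is given a Python buffer, using the `view.format` detection shown in the added code. A small example of the common case, assuming the usual `get(buffer, signless=..., type=..., shape=...)` binding exposed by this file (exact printed output may differ):

    import array
    from mlir import ir

    with ir.Context():
        buf = array.array("f", [1.0, 2.0, 3.0, 4.0])  # format "f" is detected as f32
        attr = ir.DenseElementsAttr.get(buf, shape=[2, 2])
        print(attr)  # expected: a dense attribute of type tensor<2x2xf32>
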
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index f384f45..ca55c44 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -4605,23 +4605,14 @@ struct DropUnitExtentBasis
}
};
-/// Drop delinearization pattern related to loops in the following way
+/// Drop delinearization with a single basis element
///
-/// ```
-/// <loop>(%iv) = (%c0) to (%ub) step (%c1) {
-/// %0 = affine.delinearize_index %iv into (%ub) : index
-/// <some_use>(%0)
-/// }
-/// ```
-///
-/// can be canonicalized to
-///
-/// ```
-/// <loop>(%iv) = (%c0) to (%ub) step (%c1) {
-/// <some_use>(%iv)
-/// }
-/// ```
-struct DropDelinearizeOfSingleLoop
+/// By definition, `delinearize_index %linear into (%basis)` is
+/// `%linear floorDiv 1` (since `1` is the product of the basis elememts,
+/// `%linear floorDiv 1` (since `1` is the product of the basis elements,
+/// to use the remainder of). Therefore, a single-element `delinearize`
+/// can be replaced by the underlying linear index.
+struct DropDelinearizeOneBasisElement
: public OpRewritePattern<affine::AffineDelinearizeIndexOp> {
using OpRewritePattern::OpRewritePattern;
@@ -4629,59 +4620,125 @@ struct DropDelinearizeOfSingleLoop
PatternRewriter &rewriter) const override {
if (delinearizeOp.getStaticBasis().size() != 1)
return failure();
- auto basis = delinearizeOp.getMixedBasis();
+ rewriter.replaceOp(delinearizeOp, delinearizeOp.getLinearIndex());
+ return success();
+ }
+};
- // Check that the `linear_index` is an induction variable.
- auto inductionVar = dyn_cast<BlockArgument>(delinearizeOp.getLinearIndex());
- if (!inductionVar)
- return failure();
+} // namespace
- // Check that the parent is a `LoopLikeOpInterface`.
- auto loopLikeOp = dyn_cast<LoopLikeOpInterface>(
- inductionVar.getParentRegion()->getParentOp());
- if (!loopLikeOp)
- return failure();
+void affine::AffineDelinearizeIndexOp::getCanonicalizationPatterns(
+ RewritePatternSet &patterns, MLIRContext *context) {
+ patterns.insert<DropDelinearizeOneBasisElement, DropUnitExtentBasis>(context);
+}
- // Check that loop is unit-rank and that the `linear_index` is the induction
- // variable.
- auto inductionVars = loopLikeOp.getLoopInductionVars();
- if (!inductionVars || inductionVars->size() != 1 ||
- inductionVars->front() != inductionVar) {
- return rewriter.notifyMatchFailure(
- delinearizeOp, "`linear_index` is not loop induction variable");
- }
+//===----------------------------------------------------------------------===//
+// LinearizeIndexOp
+//===----------------------------------------------------------------------===//
- // Check that the upper-bound is the basis.
- auto upperBounds = loopLikeOp.getLoopUpperBounds();
- if (!upperBounds || upperBounds->size() != 1 ||
- upperBounds->front() != basis.front()) {
- return rewriter.notifyMatchFailure(delinearizeOp,
- "`basis` is not upper bound");
- }
+void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder,
+ OperationState &odsState,
+ ValueRange multiIndex, ValueRange basis,
+ bool disjoint) {
+ SmallVector<Value> dynamicBasis;
+ SmallVector<int64_t> staticBasis;
+ dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis,
+ staticBasis);
+ build(odsBuilder, odsState, multiIndex, dynamicBasis, staticBasis, disjoint);
+}
- // Check that the lower bound is zero.
- auto lowerBounds = loopLikeOp.getLoopLowerBounds();
- if (!lowerBounds || lowerBounds->size() != 1 ||
- !isZeroIndex(lowerBounds->front())) {
- return rewriter.notifyMatchFailure(delinearizeOp,
- "loop lower bound is not zero");
- }
+void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder,
+ OperationState &odsState,
+ ValueRange multiIndex,
+ ArrayRef<OpFoldResult> basis,
+ bool disjoint) {
+ SmallVector<Value> dynamicBasis;
+ SmallVector<int64_t> staticBasis;
+ dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis);
+ build(odsBuilder, odsState, multiIndex, dynamicBasis, staticBasis, disjoint);
+}
+
+void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder,
+ OperationState &odsState,
+ ValueRange multiIndex,
+ ArrayRef<int64_t> basis, bool disjoint) {
+ build(odsBuilder, odsState, multiIndex, ValueRange{}, basis, disjoint);
+}
+
+LogicalResult AffineLinearizeIndexOp::verify() {
+ if (getStaticBasis().empty())
+ return emitOpError("basis should not be empty");
- // Check that the step is one.
- auto steps = loopLikeOp.getLoopSteps();
- if (!steps || steps->size() != 1 || !isConstantIntValue(steps->front(), 1))
- return rewriter.notifyMatchFailure(delinearizeOp, "loop step is not one");
+ if (getMultiIndex().size() != getStaticBasis().size())
+ return emitOpError("should be passed an index for each basis element");
- rewriter.replaceOp(delinearizeOp, inductionVar);
+ auto dynamicMarkersCount =
+ llvm::count_if(getStaticBasis(), ShapedType::isDynamic);
+ if (static_cast<size_t>(dynamicMarkersCount) != getDynamicBasis().size())
+ return emitOpError(
+ "mismatch between dynamic and static basis (kDynamic marker but no "
+ "corresponding dynamic basis entry) -- this can only happen due to an "
+ "incorrect fold/rewrite");
+
+ return success();
+}
+
+namespace {
+/// Rewrite `affine.linearize_index disjoint [%...a, %x, %...b] by (%...c, 1,
+/// %...d)` to `affine.linearize_index disjoint [%...a, %...b] by (%...c,
+/// %...d)`.
+///
+/// Note that `disjoint` is required here because, without it,
+/// `affine.linearize_index [%...a, %c64, %...b] by (%...c, 1, %...d)`
+/// is a valid operation in which the `%c64` cannot be trivially dropped.
+///
+/// Alternatively, if `%x` in the above is a known constant 0, remove it even if
+/// the operation isn't asserted to be `disjoint`.
+struct DropLinearizeUnitComponentsIfDisjointOrZero final
+ : OpRewritePattern<affine::AffineLinearizeIndexOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp op,
+ PatternRewriter &rewriter) const override {
+ size_t numIndices = op.getMultiIndex().size();
+ SmallVector<Value> newIndices;
+ newIndices.reserve(numIndices);
+ SmallVector<OpFoldResult> newBasis;
+ newBasis.reserve(numIndices);
+
+ SmallVector<OpFoldResult> basis = op.getMixedBasis();
+ for (auto [index, basisElem] : llvm::zip_equal(op.getMultiIndex(), basis)) {
+ std::optional<int64_t> basisEntry = getConstantIntValue(basisElem);
+ if (!basisEntry || *basisEntry != 1) {
+ newIndices.push_back(index);
+ newBasis.push_back(basisElem);
+ continue;
+ }
+
+ std::optional<int64_t> indexValue = getConstantIntValue(index);
+ if (!op.getDisjoint() && (!indexValue || *indexValue != 0)) {
+ newIndices.push_back(index);
+ newBasis.push_back(basisElem);
+ continue;
+ }
+ }
+ if (newIndices.size() == numIndices)
+ return failure();
+
+ if (newIndices.size() == 0) {
+ rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
+ return success();
+ }
+ rewriter.replaceOpWithNewOp<affine::AffineLinearizeIndexOp>(
+ op, newIndices, newBasis, op.getDisjoint());
return success();
}
};
-
} // namespace
-void affine::AffineDelinearizeIndexOp::getCanonicalizationPatterns(
+void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns(
RewritePatternSet &patterns, MLIRContext *context) {
- patterns.insert<DropDelinearizeOfSingleLoop, DropUnitExtentBasis>(context);
+ patterns.add<DropLinearizeUnitComponentsIfDisjointOrZero>(context);
}
//===----------------------------------------------------------------------===//
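
Concretely, the new `DropLinearizeUnitComponentsIfDisjointOrZero` pattern performs rewrites of this shape (values and constants are illustrative):

    // Before: the middle index is paired with a unit basis element.
    %l = affine.linearize_index disjoint [%i, %j, %k] by (4, 1, 8) : index
    // After: with `disjoint`, %j must be 0, so the index/basis pair is dropped.
    %l = affine.linearize_index disjoint [%i, %k] by (4, 8) : index
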
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
index d76968d..1930e98 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Transforms/Transforms.h"
#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace mlir {
@@ -44,6 +45,23 @@ struct LowerDelinearizeIndexOps
}
};
+/// Lowers `affine.linearize_index` into a sequence of multiplications and
+/// additions.
+struct LowerLinearizeIndexOps final : OpRewritePattern<AffineLinearizeIndexOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(AffineLinearizeIndexOp op,
+ PatternRewriter &rewriter) const override {
+ SmallVector<OpFoldResult> multiIndex =
+ getAsOpFoldResult(op.getMultiIndex());
+ OpFoldResult linearIndex =
+ linearizeIndex(rewriter, op.getLoc(), multiIndex, op.getMixedBasis());
+ Value linearIndexValue =
+ getValueOrCreateConstantIntOp(rewriter, op.getLoc(), linearIndex);
+ rewriter.replaceOp(op, linearIndexValue);
+ return success();
+ }
+};
+
class ExpandAffineIndexOpsPass
: public affine::impl::AffineExpandIndexOpsBase<ExpandAffineIndexOpsPass> {
public:
@@ -63,7 +81,8 @@ public:
void mlir::affine::populateAffineExpandIndexOpsPatterns(
RewritePatternSet &patterns) {
- patterns.insert<LowerDelinearizeIndexOps>(patterns.getContext());
+ patterns.insert<LowerDelinearizeIndexOps, LowerLinearizeIndexOps>(
+ patterns.getContext());
}
std::unique_ptr<Pass> mlir::affine::createAffineExpandIndexOpsPass() {
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
index 2680502..7fe422f 100644
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -1999,6 +1999,12 @@ mlir::affine::delinearizeIndex(OpBuilder &b, Location loc, Value linearIndex,
OpFoldResult mlir::affine::linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
ArrayRef<OpFoldResult> basis,
ImplicitLocOpBuilder &builder) {
+ return linearizeIndex(builder, builder.getLoc(), multiIndex, basis);
+}
+
+OpFoldResult mlir::affine::linearizeIndex(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> multiIndex,
+ ArrayRef<OpFoldResult> basis) {
assert(multiIndex.size() == basis.size());
SmallVector<AffineExpr> basisAffine;
for (size_t i = 0; i < basis.size(); ++i) {
@@ -2009,13 +2015,13 @@ OpFoldResult mlir::affine::linearizeIndex(ArrayRef<OpFoldResult> multiIndex,
SmallVector<OpFoldResult> strides;
strides.reserve(stridesAffine.size());
llvm::transform(stridesAffine, std::back_inserter(strides),
- [&builder, &basis](AffineExpr strideExpr) {
+ [&builder, &basis, loc](AffineExpr strideExpr) {
return affine::makeComposedFoldedAffineApply(
- builder, builder.getLoc(), strideExpr, basis);
+ builder, loc, strideExpr, basis);
});
auto &&[linearIndexExpr, multiIndexAndStrides] = computeLinearIndex(
OpFoldResult(builder.getIndexAttr(0)), strides, multiIndex);
- return affine::makeComposedFoldedAffineApply(
- builder, builder.getLoc(), linearIndexExpr, multiIndexAndStrides);
+ return affine::makeComposedFoldedAffineApply(builder, loc, linearIndexExpr,
+ multiIndexAndStrides);
}
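
The new overload lets callers that only have a plain `OpBuilder` and an explicit `Location` (for example a `PatternRewriter`) reuse this helper without constructing an `ImplicitLocOpBuilder`; the lowering pattern added in AffineExpandIndexOps.cpp above does exactly that. A sketch, assuming `rewriter`, `loc`, and index `Value`s `i` and `j` are available:

    // Linearize (i, j) against the static basis (3, 5).
    SmallVector<OpFoldResult> multiIndex = {getAsOpFoldResult(i), getAsOpFoldResult(j)};
    SmallVector<OpFoldResult> basis = {rewriter.getIndexAttr(3), rewriter.getIndexAttr(5)};
    OpFoldResult linear = affine::linearizeIndex(rewriter, loc, multiIndex, basis);
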
diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
index 93a004d..9128538 100644
--- a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
@@ -6,7 +6,6 @@ add_mlir_dialect_library(MLIRArithTransforms
EmulateWideInt.cpp
EmulateNarrowType.cpp
ExpandOps.cpp
- IntNarrowing.cpp
IntRangeOptimizations.cpp
ReifyValueBounds.cpp
UnsignedWhenEquivalent.cpp
diff --git a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp
deleted file mode 100644
index b61218b..0000000
--- a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp
+++ /dev/null
@@ -1,790 +0,0 @@
-//===- IntNarrowing.cpp - Integer bitwidth reduction optimizations --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-
-#include "mlir/Analysis/Presburger/IntegerRelation.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Arith/Transforms/Transforms.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/BuiltinTypeInterfaces.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/TypeUtilities.h"
-#include "mlir/Interfaces/ValueBoundsOpInterface.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include <cassert>
-#include <cstdint>
-
-namespace mlir::arith {
-#define GEN_PASS_DEF_ARITHINTNARROWING
-#include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
-} // namespace mlir::arith
-
-namespace mlir::arith {
-namespace {
-//===----------------------------------------------------------------------===//
-// Common Helpers
-//===----------------------------------------------------------------------===//
-
-/// The base for integer bitwidth narrowing patterns.
-template <typename SourceOp>
-struct NarrowingPattern : OpRewritePattern<SourceOp> {
- NarrowingPattern(MLIRContext *ctx, const ArithIntNarrowingOptions &options,
- PatternBenefit benefit = 1)
- : OpRewritePattern<SourceOp>(ctx, benefit),
- supportedBitwidths(options.bitwidthsSupported.begin(),
- options.bitwidthsSupported.end()) {
- assert(!supportedBitwidths.empty() && "Invalid options");
- assert(!llvm::is_contained(supportedBitwidths, 0) && "Invalid bitwidth");
- llvm::sort(supportedBitwidths);
- }
-
- FailureOr<unsigned>
- getNarrowestCompatibleBitwidth(unsigned bitsRequired) const {
- for (unsigned candidate : supportedBitwidths)
- if (candidate >= bitsRequired)
- return candidate;
-
- return failure();
- }
-
- /// Returns the narrowest supported type that fits `bitsRequired`.
- FailureOr<Type> getNarrowType(unsigned bitsRequired, Type origTy) const {
- assert(origTy);
- FailureOr<unsigned> bestBitwidth =
- getNarrowestCompatibleBitwidth(bitsRequired);
- if (failed(bestBitwidth))
- return failure();
-
- Type elemTy = getElementTypeOrSelf(origTy);
- if (!isa<IntegerType>(elemTy))
- return failure();
-
- auto newElemTy = IntegerType::get(origTy.getContext(), *bestBitwidth);
- if (newElemTy == elemTy)
- return failure();
-
- if (origTy == elemTy)
- return newElemTy;
-
- if (auto shapedTy = dyn_cast<ShapedType>(origTy))
- if (dyn_cast<IntegerType>(shapedTy.getElementType()))
- return shapedTy.clone(shapedTy.getShape(), newElemTy);
-
- return failure();
- }
-
-private:
- // Supported integer bitwidths in the ascending order.
- llvm::SmallVector<unsigned, 6> supportedBitwidths;
-};
-
-/// Returns the integer bitwidth required to represent `type`.
-FailureOr<unsigned> calculateBitsRequired(Type type) {
- assert(type);
- if (auto intTy = dyn_cast<IntegerType>(getElementTypeOrSelf(type)))
- return intTy.getWidth();
-
- return failure();
-}
-
-enum class ExtensionKind { Sign, Zero };
-
-/// Wrapper around `arith::ExtSIOp` and `arith::ExtUIOp` ops that abstracts away
-/// the exact op type. Exposes helper functions to query the types, operands,
-/// and the result. This is so that we can handle both extension kinds without
-/// needing to use templates or branching.
-class ExtensionOp {
-public:
- /// Attemps to create a new extension op from `op`. Returns an extension op
- /// wrapper when `op` is either `arith.extsi` or `arith.extui`, and failure
- /// otherwise.
- static FailureOr<ExtensionOp> from(Operation *op) {
- if (dyn_cast_or_null<arith::ExtSIOp>(op))
- return ExtensionOp{op, ExtensionKind::Sign};
- if (dyn_cast_or_null<arith::ExtUIOp>(op))
- return ExtensionOp{op, ExtensionKind::Zero};
-
- return failure();
- }
-
- ExtensionOp(const ExtensionOp &) = default;
- ExtensionOp &operator=(const ExtensionOp &) = default;
-
- /// Creates a new extension op of the same kind.
- Operation *recreate(PatternRewriter &rewriter, Location loc, Type newType,
- Value in) {
- if (kind == ExtensionKind::Sign)
- return rewriter.create<arith::ExtSIOp>(loc, newType, in);
-
- return rewriter.create<arith::ExtUIOp>(loc, newType, in);
- }
-
- /// Replaces `toReplace` with a new extension op of the same kind.
- void recreateAndReplace(PatternRewriter &rewriter, Operation *toReplace,
- Value in) {
- assert(toReplace->getNumResults() == 1);
- Type newType = toReplace->getResult(0).getType();
- Operation *newOp = recreate(rewriter, toReplace->getLoc(), newType, in);
- rewriter.replaceOp(toReplace, newOp->getResult(0));
- }
-
- ExtensionKind getKind() { return kind; }
-
- Value getResult() { return op->getResult(0); }
- Value getIn() { return op->getOperand(0); }
-
- Type getType() { return getResult().getType(); }
- Type getElementType() { return getElementTypeOrSelf(getType()); }
- Type getInType() { return getIn().getType(); }
- Type getInElementType() { return getElementTypeOrSelf(getInType()); }
-
-private:
- ExtensionOp(Operation *op, ExtensionKind kind) : op(op), kind(kind) {
- assert(op);
- assert((isa<arith::ExtSIOp, arith::ExtUIOp>(op)) && "Not an extension op");
- }
- Operation *op = nullptr;
- ExtensionKind kind = {};
-};
-
-/// Returns the integer bitwidth required to represent `value`.
-unsigned calculateBitsRequired(const APInt &value,
- ExtensionKind lookThroughExtension) {
- // For unsigned values, we only need the active bits. As a special case, zero
- // requires one bit.
- if (lookThroughExtension == ExtensionKind::Zero)
- return std::max(value.getActiveBits(), 1u);
-
- // If a signed value is nonnegative, we need one extra bit for the sign.
- if (value.isNonNegative())
- return value.getActiveBits() + 1;
-
- // For the signed min, we need all the bits.
- if (value.isMinSignedValue())
- return value.getBitWidth();
-
- // For negative values, we need all the non-sign bits and one extra bit for
- // the sign.
- return value.getBitWidth() - value.getNumSignBits() + 1;
-}
-
-/// Returns the integer bitwidth required to represent `value`.
-/// Looks through either sign- or zero-extension as specified by
-/// `lookThroughExtension`.
-FailureOr<unsigned> calculateBitsRequired(Value value,
- ExtensionKind lookThroughExtension) {
- // Handle constants.
- if (TypedAttr attr; matchPattern(value, m_Constant(&attr))) {
- if (auto intAttr = dyn_cast<IntegerAttr>(attr))
- return calculateBitsRequired(intAttr.getValue(), lookThroughExtension);
-
- if (auto elemsAttr = dyn_cast<DenseElementsAttr>(attr)) {
- if (elemsAttr.getElementType().isIntOrIndex()) {
- if (elemsAttr.isSplat())
- return calculateBitsRequired(elemsAttr.getSplatValue<APInt>(),
- lookThroughExtension);
-
- unsigned maxBits = 1;
- for (const APInt &elemValue : elemsAttr.getValues<APInt>())
- maxBits = std::max(
- maxBits, calculateBitsRequired(elemValue, lookThroughExtension));
- return maxBits;
- }
- }
- }
-
- if (lookThroughExtension == ExtensionKind::Sign) {
- if (auto sext = value.getDefiningOp<arith::ExtSIOp>())
- return calculateBitsRequired(sext.getIn().getType());
- } else if (lookThroughExtension == ExtensionKind::Zero) {
- if (auto zext = value.getDefiningOp<arith::ExtUIOp>())
- return calculateBitsRequired(zext.getIn().getType());
- }
-
- // If nothing else worked, return the bitwidth requirement of the value's element type.
- return calculateBitsRequired(value.getType());
-}
-
-/// Base pattern for arith binary ops.
-/// Example:
-/// ```
-/// %lhs = arith.extsi %a : i8 to i32
-/// %rhs = arith.extsi %b : i8 to i32
-/// %r = arith.addi %lhs, %rhs : i32
-/// ==>
-/// %lhs = arith.extsi %a : i8 to i16
-/// %rhs = arith.extsi %b : i8 to i16
-/// %add = arith.addi %lhs, %rhs : i16
-/// %r = arith.extsi %add : i16 to i32
-/// ```
-template <typename BinaryOp>
-struct BinaryOpNarrowingPattern : NarrowingPattern<BinaryOp> {
- using NarrowingPattern<BinaryOp>::NarrowingPattern;
-
- /// Returns the number of bits required to represent the full result, assuming
- /// that both operands are `operandBits`-wide. Derived classes must implement
- /// this, taking into account `BinaryOp` semantics.
- virtual unsigned getResultBitsProduced(unsigned operandBits) const = 0;
-
- /// Customization point for patterns that should only apply with
- /// zero/sign-extension ops as arguments.
- virtual bool isSupported(ExtensionOp) const { return true; }
-
- LogicalResult matchAndRewrite(BinaryOp op,
- PatternRewriter &rewriter) const final {
- Type origTy = op.getType();
- FailureOr<unsigned> resultBits = calculateBitsRequired(origTy);
- if (failed(resultBits))
- return failure();
-
- // For the optimization to apply, we expect the lhs to be an extension op,
- // and the rhs to be either the same kind of extension or a constant.
- FailureOr<ExtensionOp> ext = ExtensionOp::from(op.getLhs().getDefiningOp());
- if (failed(ext) || !isSupported(*ext))
- return failure();
-
- FailureOr<unsigned> lhsBitsRequired =
- calculateBitsRequired(ext->getIn(), ext->getKind());
- if (failed(lhsBitsRequired) || *lhsBitsRequired >= *resultBits)
- return failure();
-
- FailureOr<unsigned> rhsBitsRequired =
- calculateBitsRequired(op.getRhs(), ext->getKind());
- if (failed(rhsBitsRequired) || *rhsBitsRequired >= *resultBits)
- return failure();
-
- // Negotiate a common bit requirement for both lhs and rhs, accounting for
- // the result requiring more bits than the operands.
- unsigned commonBitsRequired =
- getResultBitsProduced(std::max(*lhsBitsRequired, *rhsBitsRequired));
- FailureOr<Type> narrowTy = this->getNarrowType(commonBitsRequired, origTy);
- if (failed(narrowTy) || calculateBitsRequired(*narrowTy) >= *resultBits)
- return failure();
-
- Location loc = op.getLoc();
- Value newLhs =
- rewriter.createOrFold<arith::TruncIOp>(loc, *narrowTy, op.getLhs());
- Value newRhs =
- rewriter.createOrFold<arith::TruncIOp>(loc, *narrowTy, op.getRhs());
- Value newAdd = rewriter.create<BinaryOp>(loc, newLhs, newRhs);
- ext->recreateAndReplace(rewriter, op, newAdd);
- return success();
- }
-};
-
-//===----------------------------------------------------------------------===//
-// AddIOp Pattern
-//===----------------------------------------------------------------------===//
-
-struct AddIPattern final : BinaryOpNarrowingPattern<arith::AddIOp> {
- using BinaryOpNarrowingPattern::BinaryOpNarrowingPattern;
-
- // Addition may require one extra bit for the result.
- // Example: `UINT8_MAX + 1 == 255 + 1 == 256`.
- unsigned getResultBitsProduced(unsigned operandBits) const override {
- return operandBits + 1;
- }
-};
-
-//===----------------------------------------------------------------------===//
-// SubIOp Pattern
-//===----------------------------------------------------------------------===//
-
-struct SubIPattern final : BinaryOpNarrowingPattern<arith::SubIOp> {
- using BinaryOpNarrowingPattern::BinaryOpNarrowingPattern;
-
- // This optimization only applies to signed arguments.
- bool isSupported(ExtensionOp ext) const override {
- return ext.getKind() == ExtensionKind::Sign;
- }
-
- // Subtraction may require one extra bit for the result.
- // Example: `INT8_MAX - (-1) == 127 - (-1) == 128`.
- unsigned getResultBitsProduced(unsigned operandBits) const override {
- return operandBits + 1;
- }
-};
-
-//===----------------------------------------------------------------------===//
-// MulIOp Pattern
-//===----------------------------------------------------------------------===//
-
-struct MulIPattern final : BinaryOpNarrowingPattern<arith::MulIOp> {
- using BinaryOpNarrowingPattern::BinaryOpNarrowingPattern;
-
- // Multiplication may require up to double the operand bits.
- // Example: `UINT8_MAX * UINT8_MAX == 255 * 255 == 65025`.
- unsigned getResultBitsProduced(unsigned operandBits) const override {
- return 2 * operandBits;
- }
-};
-
-//===----------------------------------------------------------------------===//
-// DivSIOp Pattern
-//===----------------------------------------------------------------------===//
-
-struct DivSIPattern final : BinaryOpNarrowingPattern<arith::DivSIOp> {
- using BinaryOpNarrowingPattern::BinaryOpNarrowingPattern;
-
- // This optimization only applies to signed arguments.
- bool isSupported(ExtensionOp ext) const override {
- return ext.getKind() == ExtensionKind::Sign;
- }
-
- // Unlike multiplication, signed division requires only one more result bit.
- // Example: `INT8_MIN / (-1) == -128 / (-1) == 128`.
- unsigned getResultBitsProduced(unsigned operandBits) const override {
- return operandBits + 1;
- }
-};
-
-//===----------------------------------------------------------------------===//
-// DivUIOp Pattern
-//===----------------------------------------------------------------------===//
-
-struct DivUIPattern final : BinaryOpNarrowingPattern<arith::DivUIOp> {
- using BinaryOpNarrowingPattern::BinaryOpNarrowingPattern;
-
- // This optimization only applies to unsigned arguments.
- bool isSupported(ExtensionOp ext) const override {
- return ext.getKind() == ExtensionKind::Zero;
- }
-
- // Unsigned division does not require any extra result bits.
- unsigned getResultBitsProduced(unsigned operandBits) const override {
- return operandBits;
- }
-};
-
-//===----------------------------------------------------------------------===//
-// Min/Max Patterns
-//===----------------------------------------------------------------------===//
-
-template <typename MinMaxOp, ExtensionKind Kind>
-struct MinMaxPattern final : BinaryOpNarrowingPattern<MinMaxOp> {
- using BinaryOpNarrowingPattern<MinMaxOp>::BinaryOpNarrowingPattern;
-
- bool isSupported(ExtensionOp ext) const override {
- return ext.getKind() == Kind;
- }
-
- // Min/max returns one of the arguments and does not require any extra result
- // bits.
- unsigned getResultBitsProduced(unsigned operandBits) const override {
- return operandBits;
- }
-};
-using MaxSIPattern = MinMaxPattern<arith::MaxSIOp, ExtensionKind::Sign>;
-using MaxUIPattern = MinMaxPattern<arith::MaxUIOp, ExtensionKind::Zero>;
-using MinSIPattern = MinMaxPattern<arith::MinSIOp, ExtensionKind::Sign>;
-using MinUIPattern = MinMaxPattern<arith::MinUIOp, ExtensionKind::Zero>;
-
-//===----------------------------------------------------------------------===//
-// *IToFPOp Patterns
-//===----------------------------------------------------------------------===//
-
-template <typename IToFPOp, ExtensionKind Extension>
-struct IToFPPattern final : NarrowingPattern<IToFPOp> {
- using NarrowingPattern<IToFPOp>::NarrowingPattern;
-
- LogicalResult matchAndRewrite(IToFPOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<unsigned> narrowestWidth =
- calculateBitsRequired(op.getIn(), Extension);
- if (failed(narrowestWidth))
- return failure();
-
- FailureOr<Type> narrowTy =
- this->getNarrowType(*narrowestWidth, op.getIn().getType());
- if (failed(narrowTy))
- return failure();
-
- Value newIn = rewriter.createOrFold<arith::TruncIOp>(op.getLoc(), *narrowTy,
- op.getIn());
- rewriter.replaceOpWithNewOp<IToFPOp>(op, op.getType(), newIn);
- return success();
- }
-};
-using SIToFPPattern = IToFPPattern<arith::SIToFPOp, ExtensionKind::Sign>;
-using UIToFPPattern = IToFPPattern<arith::UIToFPOp, ExtensionKind::Zero>;
-
-//===----------------------------------------------------------------------===//
-// Index Cast Patterns
-//===----------------------------------------------------------------------===//
-
-// These rely on the `ValueBounds` interface for index values. For example, we
-// can often statically determine the bounds of loop induction variables.
-
-template <typename CastOp, ExtensionKind Kind>
-struct IndexCastPattern final : NarrowingPattern<CastOp> {
- using NarrowingPattern<CastOp>::NarrowingPattern;
-
- LogicalResult matchAndRewrite(CastOp op,
- PatternRewriter &rewriter) const override {
- Value in = op.getIn();
- // We only support scalar index -> integer casts.
- if (!isa<IndexType>(in.getType()))
- return failure();
-
- // Check the lower bound in both the signed and unsigned cast case. We
- // conservatively assume that even unsigned casts may be performed on
- // negative indices.
- FailureOr<int64_t> lb = ValueBoundsConstraintSet::computeConstantBound(
- presburger::BoundType::LB, in);
- if (failed(lb))
- return failure();
-
- FailureOr<int64_t> ub = ValueBoundsConstraintSet::computeConstantBound(
- presburger::BoundType::UB, in,
- /*stopCondition=*/nullptr, /*closedUB=*/true);
- if (failed(ub))
- return failure();
-
- assert(*lb <= *ub && "Invalid bounds");
- unsigned lbBitsRequired = calculateBitsRequired(APInt(64, *lb), Kind);
- unsigned ubBitsRequired = calculateBitsRequired(APInt(64, *ub), Kind);
- unsigned bitsRequired = std::max(lbBitsRequired, ubBitsRequired);
-
- IntegerType resultTy = cast<IntegerType>(op.getType());
- if (resultTy.getWidth() <= bitsRequired)
- return failure();
-
- FailureOr<Type> narrowTy = this->getNarrowType(bitsRequired, resultTy);
- if (failed(narrowTy))
- return failure();
-
- Value newCast = rewriter.create<CastOp>(op.getLoc(), *narrowTy, op.getIn());
-
- if (Kind == ExtensionKind::Sign)
- rewriter.replaceOpWithNewOp<arith::ExtSIOp>(op, resultTy, newCast);
- else
- rewriter.replaceOpWithNewOp<arith::ExtUIOp>(op, resultTy, newCast);
- return success();
- }
-};
-using IndexCastSIPattern =
- IndexCastPattern<arith::IndexCastOp, ExtensionKind::Sign>;
-using IndexCastUIPattern =
- IndexCastPattern<arith::IndexCastUIOp, ExtensionKind::Zero>;
-
-//===----------------------------------------------------------------------===//
-// Patterns to Commute Extension Ops
-//===----------------------------------------------------------------------===//
-
-struct ExtensionOverBroadcast final : NarrowingPattern<vector::BroadcastOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::BroadcastOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getSource().getDefiningOp());
- if (failed(ext))
- return failure();
-
- VectorType origTy = op.getResultVectorType();
- VectorType newTy =
- origTy.cloneWith(origTy.getShape(), ext->getInElementType());
- Value newBroadcast =
- rewriter.create<vector::BroadcastOp>(op.getLoc(), newTy, ext->getIn());
- ext->recreateAndReplace(rewriter, op, newBroadcast);
- return success();
- }
-};
-
-struct ExtensionOverExtract final : NarrowingPattern<vector::ExtractOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::ExtractOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getVector().getDefiningOp());
- if (failed(ext))
- return failure();
-
- Value newExtract = rewriter.create<vector::ExtractOp>(
- op.getLoc(), ext->getIn(), op.getMixedPosition());
- ext->recreateAndReplace(rewriter, op, newExtract);
- return success();
- }
-};
-
-struct ExtensionOverExtractElement final
- : NarrowingPattern<vector::ExtractElementOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::ExtractElementOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getVector().getDefiningOp());
- if (failed(ext))
- return failure();
-
- Value newExtract = rewriter.create<vector::ExtractElementOp>(
- op.getLoc(), ext->getIn(), op.getPosition());
- ext->recreateAndReplace(rewriter, op, newExtract);
- return success();
- }
-};
-
-struct ExtensionOverExtractStridedSlice final
- : NarrowingPattern<vector::ExtractStridedSliceOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::ExtractStridedSliceOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getVector().getDefiningOp());
- if (failed(ext))
- return failure();
-
- VectorType origTy = op.getType();
- VectorType extractTy =
- origTy.cloneWith(origTy.getShape(), ext->getInElementType());
- Value newExtract = rewriter.create<vector::ExtractStridedSliceOp>(
- op.getLoc(), extractTy, ext->getIn(), op.getOffsets(), op.getSizes(),
- op.getStrides());
- ext->recreateAndReplace(rewriter, op, newExtract);
- return success();
- }
-};
-
-/// Base pattern for `vector.insert` narrowing patterns.
-template <typename InsertionOp>
-struct ExtensionOverInsertionPattern : NarrowingPattern<InsertionOp> {
- using NarrowingPattern<InsertionOp>::NarrowingPattern;
-
- /// Derived classes must provide a function to create the matching insertion
- /// op based on the original op and new arguments.
- virtual InsertionOp createInsertionOp(PatternRewriter &rewriter,
- InsertionOp origInsert,
- Value narrowValue,
- Value narrowDest) const = 0;
-
- LogicalResult matchAndRewrite(InsertionOp op,
- PatternRewriter &rewriter) const final {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getSource().getDefiningOp());
- if (failed(ext))
- return failure();
-
- FailureOr<InsertionOp> newInsert = createNarrowInsert(op, rewriter, *ext);
- if (failed(newInsert))
- return failure();
- ext->recreateAndReplace(rewriter, op, *newInsert);
- return success();
- }
-
- FailureOr<InsertionOp> createNarrowInsert(InsertionOp op,
- PatternRewriter &rewriter,
- ExtensionOp insValue) const {
- // Calculate the operand and result bitwidths. We can only apply narrowing
- // when the inserted source value and destination vector require fewer bits
- // than the result. Because the source and destination may have different
- // bitwidth requirements, we have to find the common narrow bitwidth that
- // is greater than or equal to the operand bitwidth requirements and still
- // narrower than the result.
- FailureOr<unsigned> origBitsRequired = calculateBitsRequired(op.getType());
- if (failed(origBitsRequired))
- return failure();
-
- // TODO: We could relax this check by disregarding bitwidth requirements of
- // elements that we know will be replaced by the insertion.
- FailureOr<unsigned> destBitsRequired =
- calculateBitsRequired(op.getDest(), insValue.getKind());
- if (failed(destBitsRequired) || *destBitsRequired >= *origBitsRequired)
- return failure();
-
- FailureOr<unsigned> insertedBitsRequired =
- calculateBitsRequired(insValue.getIn(), insValue.getKind());
- if (failed(insertedBitsRequired) ||
- *insertedBitsRequired >= *origBitsRequired)
- return failure();
-
- // Find a narrower element type that satisfies the bitwidth requirements of
- // both the source and the destination values.
- unsigned newInsertionBits =
- std::max(*destBitsRequired, *insertedBitsRequired);
- FailureOr<Type> newVecTy =
- this->getNarrowType(newInsertionBits, op.getType());
- if (failed(newVecTy) || *newVecTy == op.getType())
- return failure();
-
- FailureOr<Type> newInsertedValueTy =
- this->getNarrowType(newInsertionBits, insValue.getType());
- if (failed(newInsertedValueTy))
- return failure();
-
- Location loc = op.getLoc();
- Value narrowValue = rewriter.createOrFold<arith::TruncIOp>(
- loc, *newInsertedValueTy, insValue.getResult());
- Value narrowDest =
- rewriter.createOrFold<arith::TruncIOp>(loc, *newVecTy, op.getDest());
- return createInsertionOp(rewriter, op, narrowValue, narrowDest);
- }
-};
-
-struct ExtensionOverInsert final
- : ExtensionOverInsertionPattern<vector::InsertOp> {
- using ExtensionOverInsertionPattern::ExtensionOverInsertionPattern;
-
- vector::InsertOp createInsertionOp(PatternRewriter &rewriter,
- vector::InsertOp origInsert,
- Value narrowValue,
- Value narrowDest) const override {
- return rewriter.create<vector::InsertOp>(origInsert.getLoc(), narrowValue,
- narrowDest,
- origInsert.getMixedPosition());
- }
-};
-
-struct ExtensionOverInsertElement final
- : ExtensionOverInsertionPattern<vector::InsertElementOp> {
- using ExtensionOverInsertionPattern::ExtensionOverInsertionPattern;
-
- vector::InsertElementOp createInsertionOp(PatternRewriter &rewriter,
- vector::InsertElementOp origInsert,
- Value narrowValue,
- Value narrowDest) const override {
- return rewriter.create<vector::InsertElementOp>(
- origInsert.getLoc(), narrowValue, narrowDest, origInsert.getPosition());
- }
-};
-
-struct ExtensionOverInsertStridedSlice final
- : ExtensionOverInsertionPattern<vector::InsertStridedSliceOp> {
- using ExtensionOverInsertionPattern::ExtensionOverInsertionPattern;
-
- vector::InsertStridedSliceOp
- createInsertionOp(PatternRewriter &rewriter,
- vector::InsertStridedSliceOp origInsert, Value narrowValue,
- Value narrowDest) const override {
- return rewriter.create<vector::InsertStridedSliceOp>(
- origInsert.getLoc(), narrowValue, narrowDest, origInsert.getOffsets(),
- origInsert.getStrides());
- }
-};
-
-struct ExtensionOverShapeCast final : NarrowingPattern<vector::ShapeCastOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::ShapeCastOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getSource().getDefiningOp());
- if (failed(ext))
- return failure();
-
- VectorType origTy = op.getResultVectorType();
- VectorType newTy =
- origTy.cloneWith(origTy.getShape(), ext->getInElementType());
- Value newCast =
- rewriter.create<vector::ShapeCastOp>(op.getLoc(), newTy, ext->getIn());
- ext->recreateAndReplace(rewriter, op, newCast);
- return success();
- }
-};
-
-struct ExtensionOverTranspose final : NarrowingPattern<vector::TransposeOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::TransposeOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getVector().getDefiningOp());
- if (failed(ext))
- return failure();
-
- VectorType origTy = op.getResultVectorType();
- VectorType newTy =
- origTy.cloneWith(origTy.getShape(), ext->getInElementType());
- Value newTranspose = rewriter.create<vector::TransposeOp>(
- op.getLoc(), newTy, ext->getIn(), op.getPermutation());
- ext->recreateAndReplace(rewriter, op, newTranspose);
- return success();
- }
-};
-
-struct ExtensionOverFlatTranspose final
- : NarrowingPattern<vector::FlatTransposeOp> {
- using NarrowingPattern::NarrowingPattern;
-
- LogicalResult matchAndRewrite(vector::FlatTransposeOp op,
- PatternRewriter &rewriter) const override {
- FailureOr<ExtensionOp> ext =
- ExtensionOp::from(op.getMatrix().getDefiningOp());
- if (failed(ext))
- return failure();
-
- VectorType origTy = op.getType();
- VectorType newTy =
- origTy.cloneWith(origTy.getShape(), ext->getInElementType());
- Value newTranspose = rewriter.create<vector::FlatTransposeOp>(
- op.getLoc(), newTy, ext->getIn(), op.getRowsAttr(),
- op.getColumnsAttr());
- ext->recreateAndReplace(rewriter, op, newTranspose);
- return success();
- }
-};
-
-//===----------------------------------------------------------------------===//
-// Pass Definitions
-//===----------------------------------------------------------------------===//
-
-struct ArithIntNarrowingPass final
- : impl::ArithIntNarrowingBase<ArithIntNarrowingPass> {
- using ArithIntNarrowingBase::ArithIntNarrowingBase;
-
- void runOnOperation() override {
- if (bitwidthsSupported.empty() ||
- llvm::is_contained(bitwidthsSupported, 0)) {
- // Invalid pass options.
- return signalPassFailure();
- }
-
- Operation *op = getOperation();
- MLIRContext *ctx = op->getContext();
- RewritePatternSet patterns(ctx);
- populateArithIntNarrowingPatterns(
- patterns, ArithIntNarrowingOptions{
- llvm::to_vector_of<unsigned>(bitwidthsSupported)});
- if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
- signalPassFailure();
- }
-};
-} // namespace
-
-//===----------------------------------------------------------------------===//
-// Public API
-//===----------------------------------------------------------------------===//
-
-void populateArithIntNarrowingPatterns(
- RewritePatternSet &patterns, const ArithIntNarrowingOptions &options) {
- // Add commute patterns with a higher benefit. This is to expose more
- // optimization opportunities to narrowing patterns.
- patterns.add<ExtensionOverBroadcast, ExtensionOverExtract,
- ExtensionOverExtractElement, ExtensionOverExtractStridedSlice,
- ExtensionOverInsert, ExtensionOverInsertElement,
- ExtensionOverInsertStridedSlice, ExtensionOverShapeCast,
- ExtensionOverTranspose, ExtensionOverFlatTranspose>(
- patterns.getContext(), options, PatternBenefit(2));
-
- patterns.add<AddIPattern, SubIPattern, MulIPattern, DivSIPattern,
- DivUIPattern, MaxSIPattern, MaxUIPattern, MinSIPattern,
- MinUIPattern, SIToFPPattern, UIToFPPattern, IndexCastSIPattern,
- IndexCastUIPattern>(patterns.getContext(), options);
-}
-
-} // namespace mlir::arith
diff --git a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
index d494bba..efc4db7 100644
--- a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp
@@ -15,8 +15,10 @@
#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/FoldUtils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -24,6 +26,9 @@
namespace mlir::arith {
#define GEN_PASS_DEF_ARITHINTRANGEOPTS
#include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
+
+#define GEN_PASS_DEF_ARITHINTRANGENARROWING
+#include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
} // namespace mlir::arith
using namespace mlir;
@@ -190,8 +195,264 @@ private:
DataFlowSolver &solver;
};
-struct IntRangeOptimizationsPass
- : public arith::impl::ArithIntRangeOptsBase<IntRangeOptimizationsPass> {
+/// Check if `type` is an index type or an integer type with `getWidth() > targetBitwidth`.
+static LogicalResult checkIntType(Type type, unsigned targetBitwidth) {
+ Type elemType = getElementTypeOrSelf(type);
+ if (isa<IndexType>(elemType))
+ return success();
+
+ if (auto intType = dyn_cast<IntegerType>(elemType))
+ if (intType.getWidth() > targetBitwidth)
+ return success();
+
+ return failure();
+}
+
+/// Check if the op has the same type for all operands and results, and whether
+/// this type is suitable for truncation.
+static LogicalResult checkElementwiseOpType(Operation *op,
+ unsigned targetBitwidth) {
+ if (op->getNumOperands() == 0 || op->getNumResults() == 0)
+ return failure();
+
+ Type type;
+ for (Value val : llvm::concat<Value>(op->getOperands(), op->getResults())) {
+ if (!type) {
+ type = val.getType();
+ continue;
+ }
+
+ if (type != val.getType())
+ return failure();
+ }
+
+ return checkIntType(type, targetBitwidth);
+}
+
+/// Return the union of the value ranges of all `operands`.
+static std::optional<ConstantIntRanges> getOperandsRange(DataFlowSolver &solver,
+ ValueRange operands) {
+ std::optional<ConstantIntRanges> ret;
+ for (Value value : operands) {
+ auto *maybeInferredRange =
+ solver.lookupState<IntegerValueRangeLattice>(value);
+ if (!maybeInferredRange || maybeInferredRange->getValue().isUninitialized())
+ return std::nullopt;
+
+ const ConstantIntRanges &inferredRange =
+ maybeInferredRange->getValue().getValue();
+
+ ret = (ret ? ret->rangeUnion(inferredRange) : inferredRange);
+ }
+ return ret;
+}
+
+/// Return an integer type with `targetBitwidth` bits. If `srcType` is shaped,
+/// return a shaped type with the same shape and the narrowed element type.
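+/// For example, `getTargetType(vector<4xi64>, 8)` returns `vector<4xi8>`, and
+/// `getTargetType(index, 8)` returns `i8`.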
+static Type getTargetType(Type srcType, unsigned targetBitwidth) {
+ auto dstType = IntegerType::get(srcType.getContext(), targetBitwidth);
+ if (auto shaped = dyn_cast<ShapedType>(srcType))
+ return shaped.clone(dstType);
+
+ assert(srcType.isIntOrIndex() && "Invalid src type");
+ return dstType;
+}
+
+/// Check that the provided `range` is within the `[smin, smax]` and `[umin, umax]` bounds.
+static LogicalResult checkRange(const ConstantIntRanges &range, APInt smin,
+ APInt smax, APInt umin, APInt umax) {
+ auto sge = [](APInt val1, APInt val2) -> bool {
+ unsigned width = std::max(val1.getBitWidth(), val2.getBitWidth());
+ val1 = val1.sext(width);
+ val2 = val2.sext(width);
+ return val1.sge(val2);
+ };
+ auto sle = [](APInt val1, APInt val2) -> bool {
+ unsigned width = std::max(val1.getBitWidth(), val2.getBitWidth());
+ val1 = val1.sext(width);
+ val2 = val2.sext(width);
+ return val1.sle(val2);
+ };
+ auto uge = [](APInt val1, APInt val2) -> bool {
+ unsigned width = std::max(val1.getBitWidth(), val2.getBitWidth());
+ val1 = val1.zext(width);
+ val2 = val2.zext(width);
+ return val1.uge(val2);
+ };
+ auto ule = [](APInt val1, APInt val2) -> bool {
+ unsigned width = std::max(val1.getBitWidth(), val2.getBitWidth());
+ val1 = val1.zext(width);
+ val2 = val2.zext(width);
+ return val1.ule(val2);
+ };
+ return success(sge(range.smin(), smin) && sle(range.smax(), smax) &&
+ uge(range.umin(), umin) && ule(range.umax(), umax));
+}
+
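+/// Cast `src` to `dstType`; both must be integer-like scalars or both vectors.
+/// Index-typed values are converted with `arith.index_castui`; otherwise the
+/// value is truncated or zero-extended as needed, e.g. i64 -> i32 emits
+/// `arith.trunci` and i16 -> i32 emits `arith.extui`.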
+static Value doCast(OpBuilder &builder, Location loc, Value src, Type dstType) {
+ Type srcType = src.getType();
+ assert(isa<VectorType>(srcType) == isa<VectorType>(dstType) &&
+ "Mixing vector and non-vector types");
+ Type srcElemType = getElementTypeOrSelf(srcType);
+ Type dstElemType = getElementTypeOrSelf(dstType);
+ assert(srcElemType.isIntOrIndex() && "Invalid src type");
+ assert(dstElemType.isIntOrIndex() && "Invalid dst type");
+ if (srcType == dstType)
+ return src;
+
+ if (isa<IndexType>(srcElemType) || isa<IndexType>(dstElemType))
+ return builder.create<arith::IndexCastUIOp>(loc, dstType, src);
+
+ auto srcInt = cast<IntegerType>(srcElemType);
+ auto dstInt = cast<IntegerType>(dstElemType);
+ if (dstInt.getWidth() < srcInt.getWidth())
+ return builder.create<arith::TruncIOp>(loc, dstType, src);
+
+ return builder.create<arith::ExtUIOp>(loc, dstType, src);
+}
+
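+/// Narrow elementwise integer ops to a supported target bitwidth when integer
+/// range analysis proves the results fit. For example, assuming 8 is one of
+/// the target bitwidths and every value below is known to lie in [0, 100]:
+///
+///   %r = arith.addi %a, %b : i32
+/// ==>
+///   %a8 = arith.trunci %a : i32 to i8
+///   %b8 = arith.trunci %b : i32 to i8
+///   %r8 = arith.addi %a8, %b8 : i8
+///   %r  = arith.extui %r8 : i8 to i32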
+struct NarrowElementwise final : OpTraitRewritePattern<OpTrait::Elementwise> {
+ NarrowElementwise(MLIRContext *context, DataFlowSolver &s,
+ ArrayRef<unsigned> target)
+ : OpTraitRewritePattern(context), solver(s), targetBitwidths(target) {}
+
+ using OpTraitRewritePattern::OpTraitRewritePattern;
+ LogicalResult matchAndRewrite(Operation *op,
+ PatternRewriter &rewriter) const override {
+ std::optional<ConstantIntRanges> range =
+ getOperandsRange(solver, op->getResults());
+ if (!range)
+ return failure();
+
+ for (unsigned targetBitwidth : targetBitwidths) {
+ if (failed(checkElementwiseOpType(op, targetBitwidth)))
+ continue;
+
+ Type srcType = op->getResult(0).getType();
+
+ // We are truncating op args to the desired bitwidth before the op and
+ // then extending op results back to the original width after. extui and
+ // extsi can produce different results for negative values, so limit the
+ // signed range to non-negative values.
+ auto smin = APInt::getZero(targetBitwidth);
+ auto smax = APInt::getSignedMaxValue(targetBitwidth);
+ auto umin = APInt::getMinValue(targetBitwidth);
+ auto umax = APInt::getMaxValue(targetBitwidth);
+ if (failed(checkRange(*range, smin, smax, umin, umax)))
+ continue;
+
+ Type targetType = getTargetType(srcType, targetBitwidth);
+ if (targetType == srcType)
+ continue;
+
+ Location loc = op->getLoc();
+ IRMapping mapping;
+ for (Value arg : op->getOperands()) {
+ Value newArg = doCast(rewriter, loc, arg, targetType);
+ mapping.map(arg, newArg);
+ }
+
+ Operation *newOp = rewriter.clone(*op, mapping);
+ rewriter.modifyOpInPlace(newOp, [&]() {
+ for (OpResult res : newOp->getResults()) {
+ res.setType(targetType);
+ }
+ });
+ SmallVector<Value> newResults;
+ for (Value res : newOp->getResults())
+ newResults.emplace_back(doCast(rewriter, loc, res, srcType));
+
+ rewriter.replaceOp(op, newResults);
+ return success();
+ }
+ return failure();
+ }
+
+private:
+ DataFlowSolver &solver;
+ SmallVector<unsigned, 4> targetBitwidths;
+};
+
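+/// Narrow `arith.cmpi` to a supported target bitwidth when integer range
+/// analysis proves both operands fit. For example, assuming 8 is one of the
+/// target bitwidths and %a and %b are known to lie in [0, 100]:
+///
+///   %r = arith.cmpi ult, %a, %b : i64
+/// ==>
+///   %a8 = arith.trunci %a : i64 to i8
+///   %b8 = arith.trunci %b : i64 to i8
+///   %r  = arith.cmpi ult, %a8, %b8 : i8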
+struct NarrowCmpI final : OpRewritePattern<arith::CmpIOp> {
+ NarrowCmpI(MLIRContext *context, DataFlowSolver &s, ArrayRef<unsigned> target)
+ : OpRewritePattern(context), solver(s), targetBitwidths(target) {}
+
+ LogicalResult matchAndRewrite(arith::CmpIOp op,
+ PatternRewriter &rewriter) const override {
+ Value lhs = op.getLhs();
+ Value rhs = op.getRhs();
+
+ std::optional<ConstantIntRanges> range =
+ getOperandsRange(solver, {lhs, rhs});
+ if (!range)
+ return failure();
+
+ for (unsigned targetBitwidth : targetBitwidths) {
+ Type srcType = lhs.getType();
+ if (failed(checkIntType(srcType, targetBitwidth)))
+ continue;
+
+ auto smin = APInt::getSignedMinValue(targetBitwidth);
+ auto smax = APInt::getSignedMaxValue(targetBitwidth);
+ auto umin = APInt::getMinValue(targetBitwidth);
+ auto umax = APInt::getMaxValue(targetBitwidth);
+ if (failed(checkRange(*range, smin, smax, umin, umax)))
+ continue;
+
+ Type targetType = getTargetType(srcType, targetBitwidth);
+ if (targetType == srcType)
+ continue;
+
+ Location loc = op->getLoc();
+ IRMapping mapping;
+ for (Value arg : op->getOperands()) {
+ Value newArg = doCast(rewriter, loc, arg, targetType);
+ mapping.map(arg, newArg);
+ }
+
+ Operation *newOp = rewriter.clone(*op, mapping);
+ rewriter.replaceOp(op, newOp->getResults());
+ return success();
+ }
+ return failure();
+ }
+
+private:
+ DataFlowSolver &solver;
+ SmallVector<unsigned, 4> targetBitwidths;
+};
+
+/// Fold index_castui(index_castui(%arg: i8, index), i8) -> %arg
+/// This pattern assumes that none of the passed `targetBitwidths` is wider
+/// than the index type.
+struct FoldIndexCastChain final : OpRewritePattern<arith::IndexCastUIOp> {
+ FoldIndexCastChain(MLIRContext *context, ArrayRef<unsigned> target)
+ : OpRewritePattern(context), targetBitwidths(target) {}
+
+ LogicalResult matchAndRewrite(arith::IndexCastUIOp op,
+ PatternRewriter &rewriter) const override {
+ auto srcOp = op.getIn().getDefiningOp<arith::IndexCastUIOp>();
+ if (!srcOp)
+ return failure();
+
+ Value src = srcOp.getIn();
+ if (src.getType() != op.getType())
+ return failure();
+
+ auto intType = dyn_cast<IntegerType>(op.getType());
+ if (!intType || !llvm::is_contained(targetBitwidths, intType.getWidth()))
+ return failure();
+
+ rewriter.replaceOp(op, src);
+ return success();
+ }
+
+private:
+ SmallVector<unsigned, 4> targetBitwidths;
+};
+
+struct IntRangeOptimizationsPass final
+ : arith::impl::ArithIntRangeOptsBase<IntRangeOptimizationsPass> {
void runOnOperation() override {
Operation *op = getOperation();
@@ -214,6 +475,35 @@ struct IntRangeOptimizationsPass
signalPassFailure();
}
};
+
+struct IntRangeNarrowingPass final
+ : arith::impl::ArithIntRangeNarrowingBase<IntRangeNarrowingPass> {
+ using ArithIntRangeNarrowingBase::ArithIntRangeNarrowingBase;
+
+ void runOnOperation() override {
+ Operation *op = getOperation();
+ MLIRContext *ctx = op->getContext();
+ DataFlowSolver solver;
+ solver.load<DeadCodeAnalysis>();
+ solver.load<IntegerRangeAnalysis>();
+ if (failed(solver.initializeAndRun(op)))
+ return signalPassFailure();
+
+ DataFlowListener listener(solver);
+
+ RewritePatternSet patterns(ctx);
+ populateIntRangeNarrowingPatterns(patterns, solver, bitwidthsSupported);
+
+ GreedyRewriteConfig config;
+ // We specifically need bottom-up traversal, as the cmpi pattern needs range
+ // data attached to its original argument values.
+ config.useTopDownTraversal = false;
+ config.listener = &listener;
+
+ if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config)))
+ signalPassFailure();
+ }
+};
} // namespace
void mlir::arith::populateIntRangeOptimizationsPatterns(
@@ -222,6 +512,14 @@ void mlir::arith::populateIntRangeOptimizationsPatterns(
DeleteTrivialRem<RemUIOp>>(patterns.getContext(), solver);
}
+void mlir::arith::populateIntRangeNarrowingPatterns(
+ RewritePatternSet &patterns, DataFlowSolver &solver,
+ ArrayRef<unsigned> bitwidthsSupported) {
+ patterns.add<NarrowElementwise, NarrowCmpI>(patterns.getContext(), solver,
+ bitwidthsSupported);
+ patterns.add<FoldIndexCastChain>(patterns.getContext(), bitwidthsSupported);
+}
+
std::unique_ptr<Pass> mlir::arith::createIntRangeOptimizationsPass() {
return std::make_unique<IntRangeOptimizationsPass>();
}
diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
index f050355..24c892f 100644
--- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
@@ -43,20 +43,17 @@ using namespace mlir::vector;
struct VectorShape {
ArrayRef<int64_t> sizes;
ArrayRef<bool> scalableFlags;
-
- bool empty() const { return sizes.empty(); }
};
-// Returns vector shape if the type is a vector. Returns an empty shape if it is
-// not a vector.
-static VectorShape vectorShape(Type type) {
- auto vectorType = dyn_cast<VectorType>(type);
- return vectorType
- ? VectorShape{vectorType.getShape(), vectorType.getScalableDims()}
- : VectorShape{};
+// Returns the vector shape if the type is a vector, otherwise returns nullopt.
+static std::optional<VectorShape> vectorShape(Type type) {
+ if (auto vectorType = dyn_cast<VectorType>(type)) {
+ return VectorShape{vectorType.getShape(), vectorType.getScalableDims()};
+ }
+ return std::nullopt;
}
-static VectorShape vectorShape(Value value) {
+static std::optional<VectorShape> vectorShape(Value value) {
return vectorShape(value.getType());
}
@@ -65,19 +62,18 @@ static VectorShape vectorShape(Value value) {
//----------------------------------------------------------------------------//
// Broadcasts scalar type into vector type (iff shape is non-scalar).
-static Type broadcast(Type type, VectorShape shape) {
+static Type broadcast(Type type, std::optional<VectorShape> shape) {
assert(!isa<VectorType>(type) && "must be scalar type");
- return !shape.empty()
- ? VectorType::get(shape.sizes, type, shape.scalableFlags)
- : type;
+ return shape ? VectorType::get(shape->sizes, type, shape->scalableFlags)
+ : type;
}
// Broadcasts scalar value into vector (iff shape is non-scalar).
static Value broadcast(ImplicitLocOpBuilder &builder, Value value,
- VectorShape shape) {
+ std::optional<VectorShape> shape) {
assert(!isa<VectorType>(value.getType()) && "must be scalar value");
auto type = broadcast(value.getType(), shape);
- return !shape.empty() ? builder.create<BroadcastOp>(type, value) : value;
+ return shape ? builder.create<BroadcastOp>(type, value) : value;
}
//----------------------------------------------------------------------------//
@@ -227,7 +223,7 @@ static Value clamp(ImplicitLocOpBuilder &builder, Value value, Value lowerBound,
static std::pair<Value, Value> frexp(ImplicitLocOpBuilder &builder, Value arg,
bool isPositive = false) {
assert(getElementTypeOrSelf(arg).isF32() && "arg must be f32 type");
- VectorShape shape = vectorShape(arg);
+ std::optional<VectorShape> shape = vectorShape(arg);
auto bcast = [&](Value value) -> Value {
return broadcast(builder, value, shape);
@@ -267,7 +263,7 @@ static std::pair<Value, Value> frexp(ImplicitLocOpBuilder &builder, Value arg,
// Computes exp2 for an i32 argument.
static Value exp2I32(ImplicitLocOpBuilder &builder, Value arg) {
assert(getElementTypeOrSelf(arg).isInteger(32) && "arg must be i32 type");
- VectorShape shape = vectorShape(arg);
+ std::optional<VectorShape> shape = vectorShape(arg);
auto bcast = [&](Value value) -> Value {
return broadcast(builder, value, shape);
@@ -293,7 +289,7 @@ Value makePolynomialCalculation(ImplicitLocOpBuilder &builder,
Type elementType = getElementTypeOrSelf(x);
assert((elementType.isF32() || elementType.isF16()) &&
"x must be f32 or f16 type");
- VectorShape shape = vectorShape(x);
+ std::optional<VectorShape> shape = vectorShape(x);
if (coeffs.empty())
return broadcast(builder, floatCst(builder, 0.0f, elementType), shape);
@@ -391,7 +387,7 @@ AtanApproximation::matchAndRewrite(math::AtanOp op,
if (!getElementTypeOrSelf(operand).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
Value abs = builder.create<math::AbsFOp>(operand);
@@ -490,7 +486,7 @@ Atan2Approximation::matchAndRewrite(math::Atan2Op op,
return rewriter.notifyMatchFailure(op, "unsupported operand type");
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
- VectorShape shape = vectorShape(op.getResult());
+ std::optional<VectorShape> shape = vectorShape(op.getResult());
// Compute atan in the valid range.
auto div = builder.create<arith::DivFOp>(y, x);
@@ -556,7 +552,7 @@ TanhApproximation::matchAndRewrite(math::TanhOp op,
if (!getElementTypeOrSelf(op.getOperand()).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -644,7 +640,7 @@ LogApproximationBase<Op>::logMatchAndRewrite(Op op, PatternRewriter &rewriter,
if (!getElementTypeOrSelf(op.getOperand()).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -791,7 +787,7 @@ Log1pApproximation::matchAndRewrite(math::Log1pOp op,
if (!getElementTypeOrSelf(op.getOperand()).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -846,7 +842,7 @@ AsinPolynomialApproximation::matchAndRewrite(math::AsinOp op,
if (!(elementType.isF32() || elementType.isF16()))
return rewriter.notifyMatchFailure(op,
"only f32 and f16 type is supported.");
- VectorShape shape = vectorShape(operand);
+ std::optional<VectorShape> shape = vectorShape(operand);
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -941,7 +937,7 @@ AcosPolynomialApproximation::matchAndRewrite(math::AcosOp op,
if (!(elementType.isF32() || elementType.isF16()))
return rewriter.notifyMatchFailure(op,
"only f32 and f16 type is supported.");
- VectorShape shape = vectorShape(operand);
+ std::optional<VectorShape> shape = vectorShape(operand);
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -1019,7 +1015,7 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op,
if (!(elementType.isF32() || elementType.isF16()))
return rewriter.notifyMatchFailure(op,
"only f32 and f16 type is supported.");
- VectorShape shape = vectorShape(operand);
+ std::optional<VectorShape> shape = vectorShape(operand);
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -1128,8 +1124,9 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op,
namespace {
-Value clampWithNormals(ImplicitLocOpBuilder &builder, const VectorShape shape,
- Value value, float lowerBound, float upperBound) {
+Value clampWithNormals(ImplicitLocOpBuilder &builder,
+ const std::optional<VectorShape> shape, Value value,
+ float lowerBound, float upperBound) {
assert(!std::isnan(lowerBound));
assert(!std::isnan(upperBound));
@@ -1320,7 +1317,7 @@ ExpM1Approximation::matchAndRewrite(math::ExpM1Op op,
if (!getElementTypeOrSelf(op.getOperand()).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -1390,7 +1387,7 @@ LogicalResult SinAndCosApproximation<isSine, OpTy>::matchAndRewrite(
if (!getElementTypeOrSelf(op.getOperand()).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
auto bcast = [&](Value value) -> Value {
@@ -1517,7 +1514,7 @@ CbrtApproximation::matchAndRewrite(math::CbrtOp op,
return rewriter.notifyMatchFailure(op, "unsupported operand type");
ImplicitLocOpBuilder b(op->getLoc(), rewriter);
- VectorShape shape = vectorShape(operand);
+ std::optional<VectorShape> shape = vectorShape(operand);
Type floatTy = getElementTypeOrSelf(operand.getType());
Type intTy = b.getIntegerType(floatTy.getIntOrFloatBitWidth());
@@ -1606,10 +1603,10 @@ RsqrtApproximation::matchAndRewrite(math::RsqrtOp op,
if (!getElementTypeOrSelf(op.getOperand()).isF32())
return rewriter.notifyMatchFailure(op, "unsupported operand type");
- VectorShape shape = vectorShape(op.getOperand());
+ std::optional<VectorShape> shape = vectorShape(op.getOperand());
// Only support already-vectorized rsqrt's.
- if (shape.empty() || shape.sizes.back() % 8 != 0)
+ if (!shape || shape->sizes.empty() || shape->sizes.back() % 8 != 0)
return rewriter.notifyMatchFailure(op, "unsupported operand type");
ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 6678878..a07593b 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1084,13 +1084,12 @@ struct ForOpTensorCastFolder : public OpRewritePattern<ForOp> {
continue;
// Create a new ForOp with that iter operand replaced.
- ValueTypeCastFnTy castFn = [](OpBuilder &b, Location loc, Type type,
- Value source) {
- return b.create<tensor::CastOp>(loc, type, source);
- };
rewriter.replaceOp(
- op, replaceAndCastForOpIterArg(rewriter, op, iterOpOperand,
- incomingCast.getSource(), castFn));
+ op, replaceAndCastForOpIterArg(
+ rewriter, op, iterOpOperand, incomingCast.getSource(),
+ [](OpBuilder &b, Location loc, Type type, Value source) {
+ return b.create<tensor::CastOp>(loc, type, source);
+ }));
return success();
}
return failure();
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index dbd573f..39d0ee1 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -878,8 +878,9 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
OpFoldResult PadOp::fold(FoldAdaptor adaptor) {
// If the pad is all zeros we can fold this operation away.
if (adaptor.getPadding() && getInput1().getType() == getType()) {
- auto densePad = llvm::cast<DenseElementsAttr>(adaptor.getPadding());
- if (densePad.isSplat() && densePad.getSplatValue<APInt>().isZero()) {
+ auto densePad = llvm::dyn_cast<DenseElementsAttr>(adaptor.getPadding());
+ if (densePad && densePad.isSplat() &&
+ densePad.getSplatValue<APInt>().isZero()) {
return getInput1();
}
}
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index 1d6f8a99..f169dab 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -18,6 +18,7 @@
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -37,16 +38,17 @@ using namespace mlir;
/// Returns a compressed mask. The mask value is set only if any mask is present
/// in the scale range. E.g., if `scale` equals to 2, and `intraDataOffset`
-/// equals to 2, the following mask:
+/// equals to 1 (intraDataOffset strictly smaller than scale), the following
+/// mask:
///
-/// %mask = [1, 1, 1, 0, 0, 0]
+/// %mask = [1, 1, 0, 0, 0, 0]
///
/// will first be padded with number of `intraDataOffset` zeros:
-/// %mask = [0, 0, 1, 1, 1, 0, 0, 0]
+/// %mask = [0, 1, 1, 0, 0, 0, 0, 0]
///
/// then it will return the following new compressed mask:
///
-/// %mask = [0, 1, 1, 0]
+/// %mask = [1, 1, 0, 0]
static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
Location loc, Value mask,
int origElements, int scale,
@@ -75,9 +77,6 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
shape.back() = numElements;
auto newMaskType = VectorType::get(shape, rewriter.getI1Type());
if (createMaskOp) {
- // TODO: handle the case with non-zero intraDataOffset for CreateMaskOp.
- if (intraDataOffset != 0)
- return failure();
OperandRange maskOperands = createMaskOp.getOperands();
size_t numMaskOperands = maskOperands.size();
AffineExpr s0;
@@ -129,26 +128,79 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
return newMask;
}
-static Value extractSubvectorFrom(RewriterBase &rewriter, Location loc,
- VectorType extractType, Value vector,
- int64_t frontOffset, int64_t subvecSize) {
+/// Extracts a 1-D subvector from a 1-D vector. It is a wrapper function for
+/// emitting `vector.extract_strided_slice`.
+static Value staticallyExtractSubvector(OpBuilder &rewriter, Location loc,
+ VectorType extractType, Value source,
+ int64_t frontOffset,
+ int64_t subvecSize) {
+ auto vectorType = cast<VectorType>(source.getType());
+ assert((vectorType.getRank() == 1 && extractType.getRank() == 1) &&
+ "expected 1-D source and destination types");
auto offsets = rewriter.getI64ArrayAttr({frontOffset});
auto sizes = rewriter.getI64ArrayAttr({subvecSize});
auto strides = rewriter.getI64ArrayAttr({1});
return rewriter
- .create<vector::ExtractStridedSliceOp>(loc, extractType, vector, offsets,
+ .create<vector::ExtractStridedSliceOp>(loc, extractType, source, offsets,
sizes, strides)
->getResult(0);
}
-static Value insertSubvectorInto(RewriterBase &rewriter, Location loc,
- Value src, Value dest, int64_t offset) {
+/// Inserts a 1-D subvector into a 1-D vector by overwriting the elements
+/// starting at `offset`. It is a wrapper function for emitting
+/// `vector.insert_strided_slice`.
+static Value staticallyInsertSubvector(OpBuilder &rewriter, Location loc,
+ Value src, Value dest, int64_t offset) {
+ auto srcType = cast<VectorType>(src.getType());
+ auto destType = cast<VectorType>(dest.getType());
+ assert(srcType.getRank() == 1 && destType.getRank() == 1 &&
+ "expected source and dest to be vector type");
auto offsets = rewriter.getI64ArrayAttr({offset});
auto strides = rewriter.getI64ArrayAttr({1});
return rewriter.create<vector::InsertStridedSliceOp>(loc, dest.getType(), src,
dest, offsets, strides);
}
+/// Extracts a 1-D subvector from a 1-D `source` vector, with index at `offset`
+/// and size `numElementsToExtract`, and inserts it into the `dest` vector. This
+/// function emits multiple `vector.extract` and `vector.insert` ops, so only
+/// use it when `offset` cannot be folded into a constant value.
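+/// For example, with `offset = %i` and `numElementsToExtract = 2`, elements
+/// `%i` and `%i + 1` of `source` are copied into positions 0 and 1 of `dest`.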
+static Value dynamicallyExtractSubVector(OpBuilder &rewriter, Location loc,
+ TypedValue<VectorType> source,
+ Value dest, OpFoldResult offset,
+ int64_t numElementsToExtract) {
+ for (int i = 0; i < numElementsToExtract; ++i) {
+ Value extractLoc =
+ (i == 0) ? offset.dyn_cast<Value>()
+ : rewriter.create<arith::AddIOp>(
+ loc, rewriter.getIndexType(), offset.dyn_cast<Value>(),
+ rewriter.create<arith::ConstantIndexOp>(loc, i));
+ auto extractOp =
+ rewriter.create<vector::ExtractOp>(loc, source, extractLoc);
+ dest = rewriter.create<vector::InsertOp>(loc, extractOp, dest, i);
+ }
+ return dest;
+}
+
+/// Returns the op sequence for an emulated sub-byte data type vector load.
+/// Specifically, uses `emulatedElemType` for loading a vector of `origElemType`.
+/// The load location is given by `base` and `linearizedIndices`, and the
+/// load size is given by `numEmulatedElementsToLoad`.
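+/// For example, loading a vector of 8 `i4` values backed by an `i8` memref
+/// loads 4 `i8` elements and bit-casts them to `vector<8xi4>`.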
+static TypedValue<VectorType>
+emulatedVectorLoad(OpBuilder &rewriter, Location loc, Value base,
+ OpFoldResult linearizedIndices,
+ int64_t numEmulatedElementsToLoad, Type origElemType,
+ Type emulatedElemType) {
+ auto scale = emulatedElemType.getIntOrFloatBitWidth() /
+ origElemType.getIntOrFloatBitWidth();
+ auto newLoad = rewriter.create<vector::LoadOp>(
+ loc, VectorType::get(numEmulatedElementsToLoad, emulatedElemType), base,
+ getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
+ return rewriter.create<vector::BitCastOp>(
+ loc, VectorType::get(numEmulatedElementsToLoad * scale, origElemType),
+ newLoad);
+}
+
namespace {
//===----------------------------------------------------------------------===//
@@ -380,25 +432,27 @@ struct ConvertVectorLoad final : OpConversionPattern<vector::LoadOp> {
? getConstantIntValue(linearizedInfo.intraDataOffset)
: 0;
- if (!foldedIntraVectorOffset) {
- // unimplemented case for dynamic intra vector offset
- return failure();
- }
-
+ // Always load enough elements to cover the original elements.
+ int64_t maxIntraDataOffset = foldedIntraVectorOffset.value_or(scale - 1);
auto numElements =
- llvm::divideCeil(*foldedIntraVectorOffset + origElements, scale);
- auto newLoad = rewriter.create<vector::LoadOp>(
- loc, VectorType::get(numElements, newElementType), adaptor.getBase(),
- getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
-
- Value result = rewriter.create<vector::BitCastOp>(
- loc, VectorType::get(numElements * scale, oldElementType), newLoad);
-
- if (isUnalignedEmulation) {
- result = extractSubvectorFrom(rewriter, loc, op.getType(), result,
- *foldedIntraVectorOffset, origElements);
+ llvm::divideCeil(maxIntraDataOffset + origElements, scale);
+ Value result =
+ emulatedVectorLoad(rewriter, loc, adaptor.getBase(), linearizedIndices,
+ numElements, oldElementType, newElementType);
+
+ if (foldedIntraVectorOffset) {
+ if (isUnalignedEmulation) {
+ result =
+ staticallyExtractSubvector(rewriter, loc, op.getType(), result,
+ *foldedIntraVectorOffset, origElements);
+ }
+ } else {
+ auto resultVector = rewriter.create<arith::ConstantOp>(
+ loc, op.getType(), rewriter.getZeroAttr(op.getType()));
+ result = dynamicallyExtractSubVector(
+ rewriter, loc, dyn_cast<TypedValue<VectorType>>(result), resultVector,
+ linearizedInfo.intraDataOffset, origElements);
}
-
rewriter.replaceOp(op, result);
return success();
}
@@ -513,8 +567,8 @@ struct ConvertVectorMaskedLoad final
// create an empty vector of the new type
auto emptyVector = rewriter.create<arith::ConstantOp>(
loc, newBitcastType, rewriter.getZeroAttr(newBitcastType));
- passthru = insertSubvectorInto(rewriter, loc, passthru, emptyVector,
- *foldedIntraVectorOffset);
+ passthru = staticallyInsertSubvector(rewriter, loc, passthru, emptyVector,
+ *foldedIntraVectorOffset);
}
auto newPassThru =
rewriter.create<vector::BitCastOp>(loc, loadType, passthru);
@@ -537,16 +591,17 @@ struct ConvertVectorMaskedLoad final
// TODO: can fold if op's mask is constant
auto emptyVector = rewriter.create<arith::ConstantOp>(
loc, newSelectMaskType, rewriter.getZeroAttr(newSelectMaskType));
- mask = insertSubvectorInto(rewriter, loc, op.getMask(), emptyVector,
- *foldedIntraVectorOffset);
+ mask = staticallyInsertSubvector(rewriter, loc, op.getMask(), emptyVector,
+ *foldedIntraVectorOffset);
}
Value result =
rewriter.create<arith::SelectOp>(loc, mask, bitCast, passthru);
if (isUnalignedEmulation) {
- result = extractSubvectorFrom(rewriter, loc, op.getType(), result,
- *foldedIntraVectorOffset, origElements);
+ result =
+ staticallyExtractSubvector(rewriter, loc, op.getType(), result,
+ *foldedIntraVectorOffset, origElements);
}
rewriter.replaceOp(op, result);
@@ -604,13 +659,10 @@ struct ConvertVectorTransferRead final
? getConstantIntValue(linearizedInfo.intraDataOffset)
: 0;
- if (!foldedIntraVectorOffset) {
- // unimplemented case for dynamic inra-vector offset
- return failure();
- }
-
+ auto maxIntraVectorOffset =
+ foldedIntraVectorOffset ? *foldedIntraVectorOffset : scale - 1;
auto numElements =
- llvm::divideCeil(*foldedIntraVectorOffset + origElements, scale);
+ llvm::divideCeil(maxIntraVectorOffset + origElements, scale);
auto newRead = rewriter.create<vector::TransferReadOp>(
loc, VectorType::get(numElements, newElementType), adaptor.getSource(),
@@ -621,9 +673,18 @@ struct ConvertVectorTransferRead final
loc, VectorType::get(numElements * scale, oldElementType), newRead);
Value result = bitCast->getResult(0);
- if (isUnalignedEmulation) {
- result = extractSubvectorFrom(rewriter, loc, op.getType(), result,
- *foldedIntraVectorOffset, origElements);
+ if (foldedIntraVectorOffset) {
+ if (isUnalignedEmulation) {
+ result =
+ staticallyExtractSubvector(rewriter, loc, op.getType(), result,
+ *foldedIntraVectorOffset, origElements);
+ }
+ } else {
+ auto zeros = rewriter.create<arith::ConstantOp>(
+ loc, op.getType(), rewriter.getZeroAttr(op.getType()));
+ result = dynamicallyExtractSubVector(rewriter, loc, bitCast, zeros,
+ linearizedInfo.intraDataOffset,
+ origElements);
}
rewriter.replaceOp(op, result);
diff --git a/mlir/lib/Interfaces/InferIntRangeInterface.cpp b/mlir/lib/Interfaces/InferIntRangeInterface.cpp
index d879b93..6365851 100644
--- a/mlir/lib/Interfaces/InferIntRangeInterface.cpp
+++ b/mlir/lib/Interfaces/InferIntRangeInterface.cpp
@@ -8,6 +8,7 @@
#include "mlir/Interfaces/InferIntRangeInterface.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/InferIntRangeInterface.cpp.inc"
#include <optional>
@@ -28,6 +29,7 @@ const APInt &ConstantIntRanges::smin() const { return sminVal; }
const APInt &ConstantIntRanges::smax() const { return smaxVal; }
unsigned ConstantIntRanges::getStorageBitwidth(Type type) {
+ type = getElementTypeOrSelf(type);
if (type.isIndex())
return IndexType::kInternalStorageBitWidth;
if (auto integerType = dyn_cast<IntegerType>(type))
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index a2acf3e..c5610ba 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -319,9 +319,8 @@ ConstantIntRanges
mlir::intrange::inferCeilDivU(ArrayRef<ConstantIntRanges> argRanges) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- DivisionFixupFn ceilDivUIFix =
- [](const APInt &lhs, const APInt &rhs,
- const APInt &result) -> std::optional<APInt> {
+ auto ceilDivUIFix = [](const APInt &lhs, const APInt &rhs,
+ const APInt &result) -> std::optional<APInt> {
if (!lhs.urem(rhs).isZero()) {
bool overflowed = false;
APInt corrected =
@@ -368,9 +367,8 @@ ConstantIntRanges
mlir::intrange::inferCeilDivS(ArrayRef<ConstantIntRanges> argRanges) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- DivisionFixupFn ceilDivSIFix =
- [](const APInt &lhs, const APInt &rhs,
- const APInt &result) -> std::optional<APInt> {
+ auto ceilDivSIFix = [](const APInt &lhs, const APInt &rhs,
+ const APInt &result) -> std::optional<APInt> {
if (!lhs.srem(rhs).isZero() && lhs.isNonNegative() == rhs.isNonNegative()) {
bool overflowed = false;
APInt corrected =
@@ -386,9 +384,8 @@ ConstantIntRanges
mlir::intrange::inferFloorDivS(ArrayRef<ConstantIntRanges> argRanges) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- DivisionFixupFn floorDivSIFix =
- [](const APInt &lhs, const APInt &rhs,
- const APInt &result) -> std::optional<APInt> {
+ auto floorDivSIFix = [](const APInt &lhs, const APInt &rhs,
+ const APInt &result) -> std::optional<APInt> {
if (!lhs.srem(rhs).isZero() && lhs.isNonNegative() != rhs.isNonNegative()) {
bool overflowed = false;
APInt corrected =
@@ -603,8 +600,7 @@ ConstantIntRanges
mlir::intrange::inferShrS(ArrayRef<ConstantIntRanges> argRanges) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn ashr = [](const APInt &l,
- const APInt &r) -> std::optional<APInt> {
+ auto ashr = [](const APInt &l, const APInt &r) -> std::optional<APInt> {
return r.uge(r.getBitWidth()) ? std::optional<APInt>() : l.ashr(r);
};
@@ -616,8 +612,7 @@ ConstantIntRanges
mlir::intrange::inferShrU(ArrayRef<ConstantIntRanges> argRanges) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn lshr = [](const APInt &l,
- const APInt &r) -> std::optional<APInt> {
+ auto lshr = [](const APInt &l, const APInt &r) -> std::optional<APInt> {
return r.uge(r.getBitWidth()) ? std::optional<APInt>() : l.lshr(r);
};
return minMaxBy(lshr, {lhs.umin(), lhs.umax()}, {rhs.umin(), rhs.umax()},
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index dca29f5..d9461a5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -145,7 +145,9 @@ static omp::PrivateClauseOp findPrivatizer(Operation *from,
/// given operation.
static LogicalResult checkImplementationStatus(Operation &op) {
auto todo = [&op](StringRef clauseName) {
- return op.emitError(clauseName + " clause not yet supported");
+ return op.emitError() << "not yet implemented: Unhandled clause "
+ << clauseName << " in " << op.getName()
+ << " operation";
};
auto checkAligned = [&todo](auto op, LogicalResult &result) {
@@ -306,8 +308,8 @@ static LogicalResult checkImplementationStatus(Operation &op) {
result = todo("firstprivate");
if (!privatizer.getDeallocRegion().empty())
- result =
- op.emitError("privatization of structures not yet supported");
+ result = op.emitError("not yet implemented: privatization of "
+ "structures in omp.target operation");
}
}
checkThreadLimit(op, result);
@@ -4095,8 +4097,7 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
return success();
})
.Default([&](Operation *inst) {
- return inst->emitError("unsupported OpenMP operation: ")
- << inst->getName();
+ return inst->emitError() << "not yet implemented: " << inst->getName();
});
}
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
index 3781d51..3be4266 100644
--- a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
@@ -976,3 +976,20 @@ func.func @test_dilinearize_index(%linear_index: index) -> (index, index, index)
// CHECK: %[[VAL_38:.*]] = arith.select %[[VAL_36]], %[[VAL_37]], %[[VAL_34]] : index
// CHECK: return %[[VAL_11]], %[[VAL_32]], %[[VAL_38]] : index, index, index
// CHECK: }
+
+/////////////////////////////////////////////////////////////////////
+
+func.func @test_linearize_index(%arg0: index, %arg1: index, %arg2: index) -> index {
+ %ret = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (2, 3, 5) : index
+ return %ret : index
+}
+
+// CHECK-LABEL: @test_linearize_index
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index)
+// CHECK: %[[c15:.+]] = arith.constant 15 : index
+// CHECK-NEXT: %[[tmp0:.+]] = arith.muli %[[arg0]], %[[c15]] : index
+// CHECK-NEXT: %[[c5:.+]] = arith.constant 5 : index
+// CHECK-NEXT: %[[tmp1:.+]] = arith.muli %[[arg1]], %[[c5]] : index
+// CHECK-NEXT: %[[tmp2:.+]] = arith.addi %[[tmp0]], %[[tmp1]] : index
+// CHECK-NEXT: %[[ret:.+]] = arith.addi %[[tmp2]], %[[arg2]] : index
+// CHECK-NEXT: return %[[ret]]
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index e496c66..4f37dd1 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -120,7 +120,7 @@ func.func @atomic_write(%a: !llvm.ptr) -> () {
// CHECK: (%[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr)
// CHECK: omp.atomic.read %[[ARG1]] = %[[ARG0]] hint(contended) memory_order(acquire) : !llvm.ptr
func.func @atomic_read(%a: !llvm.ptr, %b: !llvm.ptr) -> () {
- omp.atomic.read %b = %a memory_order(acquire) hint(contended) : !llvm.ptr, i32
+ omp.atomic.read %b = %a memory_order(acquire) hint(contended) : !llvm.ptr, !llvm.ptr, i32
return
}
diff --git a/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir b/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir
index 9577320..ded1687 100644
--- a/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir
+++ b/mlir/test/Dialect/Affine/affine-expand-index-ops.mlir
@@ -41,3 +41,29 @@ func.func @dynamic_basis(%linear_index: index, %src: memref<?x?x?xf32>) -> (inde
%1:3 = affine.delinearize_index %linear_index into (%b0, %b1, %b2) : index, index, index
return %1#0, %1#1, %1#2 : index, index, index
}
+
+// -----
+
+// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 15 + s1 * 5 + s2)>
+
+// CHECK-LABEL: @linearize_static
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index)
+// CHECK: %[[val_0:.+]] = affine.apply #[[$map0]]()[%[[arg0]], %[[arg1]], %[[arg2]]]
+// CHECK: return %[[val_0]]
+func.func @linearize_static(%arg0: index, %arg1: index, %arg2: index) -> index {
+ %0 = affine.linearize_index [%arg0, %arg1, %arg2] by (2, 3, 5) : index
+ func.return %0 : index
+}
+
+// -----
+
+// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s1 * s2 + s3 + s0 * (s2 * s4))>
+
+// CHECK-LABEL: @linearize_dynamic
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index, %[[arg4:.+]]: index, %[[arg5:.+]]: index)
+// CHECK: %[[val_0:.+]] = affine.apply #[[$map0]]()[%[[arg0]], %[[arg1]], %[[arg5]], %[[arg2]], %[[arg4]]]
+// CHECK: return %[[val_0]]
+func.func @linearize_dynamic(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> index {
+ %0 = affine.linearize_index [%arg0, %arg1, %arg2] by (%arg3, %arg4, %arg5) : index
+ func.return %0 : index
+}
diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir
index d78c3b6..f6007aa 100644
--- a/mlir/test/Dialect/Affine/canonicalize.mlir
+++ b/mlir/test/Dialect/Affine/canonicalize.mlir
@@ -1517,6 +1517,7 @@ func.func @drop_single_loop_delinearize(%arg0 : index, %arg1 : index) -> index {
// -----
// CHECK-LABEL: func @delinearize_non_induction_variable
+// CHECK-NOT: affine.delinearize
func.func @delinearize_non_induction_variable(%arg0: memref<?xi32>, %i : index, %t0 : index, %t1 : index, %t2 : index) -> index {
%1 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 64 + s2 * 128)>(%i)[%t0, %t1, %t2]
%2 = affine.delinearize_index %1 into (1024) : index
@@ -1526,7 +1527,42 @@ func.func @delinearize_non_induction_variable(%arg0: memref<?xi32>, %i : index,
// -----
// CHECK-LABEL: func @delinearize_non_loop_like
+// CHECK-NOT: affine.delinearize
func.func @delinearize_non_loop_like(%arg0: memref<?xi32>, %i : index) -> index {
%2 = affine.delinearize_index %i into (1024) : index
return %2 : index
}
+
+// -----
+
+// CHECK-LABEL: @linearize_unit_basis_disjoint
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index)
+// CHECK: %[[ret:.+]] = affine.linearize_index disjoint [%[[arg0]], %[[arg2]]] by (3, %[[arg3]]) : index
+// CHECK: return %[[ret]]
+func.func @linearize_unit_basis_disjoint(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> index {
+ %ret = affine.linearize_index disjoint [%arg0, %arg1, %arg2] by (3, 1, %arg3) : index
+ return %ret : index
+}
+
+// -----
+
+// CHECK-LABEL: @linearize_unit_basis_zero
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index, %[[arg2:.+]]: index)
+// CHECK: %[[ret:.+]] = affine.linearize_index [%[[arg0]], %[[arg1]]] by (3, %[[arg2]]) : index
+// CHECK: return %[[ret]]
+func.func @linearize_unit_basis_zero(%arg0: index, %arg1: index, %arg2: index) -> index {
+ %c0 = arith.constant 0 : index
+ %ret = affine.linearize_index [%arg0, %c0, %arg1] by (3, 1, %arg2) : index
+ return %ret : index
+}
+
+// -----
+
+// CHECK-LABEL: @linearize_all_zero_unit_basis
+// CHECK: arith.constant 0 : index
+// CHECK-NOT: affine.linearize_index
+func.func @linearize_all_zero_unit_basis() -> index {
+ %c0 = arith.constant 0 : index
+ %ret = affine.linearize_index [%c0, %c0] by (1, 1) : index
+ return %ret : index
+}
diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir
index 869ea71..2996194 100644
--- a/mlir/test/Dialect/Affine/invalid.mlir
+++ b/mlir/test/Dialect/Affine/invalid.mlir
@@ -548,6 +548,22 @@ func.func @delinearize(%idx: index, %basis0: index, %basis1 :index) {
// -----
+func.func @linearize(%idx: index, %basis0: index, %basis1 :index) -> index {
+ // expected-error@+1 {{'affine.linearize_index' op should be passed an index for each basis element}}
+ %0 = affine.linearize_index [%idx] by (%basis0, %basis1) : index
+ return %0 : index
+}
+
+// -----
+
+func.func @linearize_empty() -> index {
+ // expected-error@+1 {{'affine.linearize_index' op basis should not be empty}}
+ %0 = affine.linearize_index [] by () : index
+ return %0 : index
+}
+
+// -----
+
func.func @dynamic_dimension_index() {
"unknown.region"() ({
%idx = "unknown.test"() : () -> (index)
diff --git a/mlir/test/Dialect/Affine/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir
index 52ae53a..1d1db5f 100644
--- a/mlir/test/Dialect/Affine/ops.mlir
+++ b/mlir/test/Dialect/Affine/ops.mlir
@@ -282,3 +282,19 @@ func.func @delinearize_mixed(%linear_idx: index, %basis1: index) -> (index, inde
%1:3 = affine.delinearize_index %linear_idx into (2, %basis1, 3) : index, index, index
return %1#0, %1#1, %1#2 : index, index, index
}
+
+// -----
+
+// CHECK-LABEL: func @linearize
+func.func @linearize(%index0: index, %index1: index, %basis0: index, %basis1 :index) -> index {
+ // CHECK: affine.linearize_index [%{{.+}}, %{{.+}}] by (%{{.+}}, %{{.+}}) : index
+ %1 = affine.linearize_index [%index0, %index1] by (%basis0, %basis1) : index
+ return %1 : index
+}
+
+// CHECK-LABEL: @linearize_mixed
+func.func @linearize_mixed(%index0: index, %index1: index, %index2: index, %basis1: index) -> index {
+ // CHECK: affine.linearize_index disjoint [%{{.+}}, %{{.+}}, %{{.+}}] by (2, %{{.+}}, 3) : index
+ %1 = affine.linearize_index disjoint [%index0, %index1, %index2] by (2, %basis1, 3) : index
+ return %1 : index
+}
diff --git a/mlir/test/Dialect/Arith/int-narrowing-invalid-options.mlir b/mlir/test/Dialect/Arith/int-narrowing-invalid-options.mlir
deleted file mode 100644
index 0e34108..0000000
--- a/mlir/test/Dialect/Arith/int-narrowing-invalid-options.mlir
+++ /dev/null
@@ -1,16 +0,0 @@
-// RUN: not mlir-opt %s --arith-int-narrowing --mlir-print-ir-after-failure 2>&1 \
-// RUN: | FileCheck %s
-
-// RUN: not mlir-opt %s --arith-int-narrowing="int-bitwidths-supported=0" \
-// RUN: --mlir-print-ir-after-failure 2>&1 | FileCheck %s
-
-// Make sure we do not crash on invalid pass options.
-
-// CHECK: IR Dump After ArithIntNarrowing Failed (arith-int-narrowing)
-// CHECK-LABEL: func.func @addi_extsi_i8
-func.func @addi_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.addi %a, %b : i32
- return %r : i32
-}
diff --git a/mlir/test/Dialect/Arith/int-narrowing.mlir b/mlir/test/Dialect/Arith/int-narrowing.mlir
deleted file mode 100644
index 153c0a8..0000000
--- a/mlir/test/Dialect/Arith/int-narrowing.mlir
+++ /dev/null
@@ -1,997 +0,0 @@
-// RUN: mlir-opt --arith-int-narrowing="int-bitwidths-supported=1,8,16,24,32" \
-// RUN: --verify-diagnostics %s | FileCheck %s
-
-//===----------------------------------------------------------------------===//
-// arith.addi
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @addi_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
-// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[LHS]], %[[RHS]] : i16
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[ADD]] : i16 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @addi_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.addi %a, %b : i32
- return %r : i32
-}
-
-// CHECK-LABEL: func.func @addi_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
-// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[LHS]], %[[RHS]] : i16
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[ADD]] : i16 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @addi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.addi %a, %b : i32
- return %r : i32
-}
-
-// arith.addi produces one more bit of result than the operand bitwidth.
-//
-// CHECK-LABEL: func.func @addi_extsi_i24
-// CHECK-SAME: (%[[ARG0:.+]]: i16, %[[ARG1:.+]]: i16)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i16 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i16 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i24
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i24
-// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[LHS]], %[[RHS]] : i24
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[ADD]] : i24 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @addi_extsi_i24(%lhs: i16, %rhs: i16) -> i32 {
- %a = arith.extsi %lhs : i16 to i32
- %b = arith.extsi %rhs : i16 to i32
- %r = arith.addi %a, %b : i32
- return %r : i32
-}
-
-// This case should not get optimized because of mixed extensions.
-//
-// CHECK-LABEL: func.func @addi_mixed_ext_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[ADD]] : i32
-func.func @addi_mixed_ext_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.addi %a, %b : i32
- return %r : i32
-}
-
-// This case should not get optimized because we cannot reduce the bitwidth
-// below i16, given the pass options set.
-//
-// CHECK-LABEL: func.func @addi_extsi_i16
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i16
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i16
-// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[EXT0]], %[[EXT1]] : i16
-// CHECK-NEXT: return %[[ADD]] : i16
-func.func @addi_extsi_i16(%lhs: i8, %rhs: i8) -> i16 {
- %a = arith.extsi %lhs : i8 to i16
- %b = arith.extsi %rhs : i8 to i16
- %r = arith.addi %a, %b : i16
- return %r : i16
-}
-
-// CHECK-LABEL: func.func @addi_extsi_3xi8_cst
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi8>)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[-1, 127, 42]> : vector<3xi16>
-// CHECK-NEXT: %[[EXT:.+]] = arith.extsi %[[ARG0]] : vector<3xi8> to vector<3xi32>
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT]] : vector<3xi32> to vector<3xi16>
-// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[LHS]], %[[CST]] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[ADD]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @addi_extsi_3xi8_cst(%lhs: vector<3xi8>) -> vector<3xi32> {
- %cst = arith.constant dense<[-1, 127, 42]> : vector<3xi32>
- %a = arith.extsi %lhs : vector<3xi8> to vector<3xi32>
- %r = arith.addi %a, %cst : vector<3xi32>
- return %r : vector<3xi32>
-}
-
-//===----------------------------------------------------------------------===//
-// arith.subi
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @subi_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
-// CHECK-NEXT: %[[SUB:.+]] = arith.subi %[[LHS]], %[[RHS]] : i16
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[SUB]] : i16 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @subi_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.subi %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.subi` ops with sign-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @subi_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[SUB:.+]] = arith.subi %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[SUB]] : i32
-func.func @subi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.subi %a, %b : i32
- return %r : i32
-}
-
-// This case should not get optimized because of mixed extensions.
-//
-// CHECK-LABEL: func.func @subi_mixed_ext_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[ADD:.+]] = arith.subi %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[ADD]] : i32
-func.func @subi_mixed_ext_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.subi %a, %b : i32
- return %r : i32
-}
-
-// arith.subi produces one more bit of result than the operand bitwidth.
-//
-// CHECK-LABEL: func.func @subi_extsi_i24
-// CHECK-SAME: (%[[ARG0:.+]]: i16, %[[ARG1:.+]]: i16)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i16 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i16 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i24
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i24
-// CHECK-NEXT: %[[ADD:.+]] = arith.subi %[[LHS]], %[[RHS]] : i24
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[ADD]] : i24 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @subi_extsi_i24(%lhs: i16, %rhs: i16) -> i32 {
- %a = arith.extsi %lhs : i16 to i32
- %b = arith.extsi %rhs : i16 to i32
- %r = arith.subi %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// arith.muli
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @muli_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
-// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[LHS]], %[[RHS]] : i16
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[MUL]] : i16 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @muli_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.muli %a, %b : i32
- return %r : i32
-}
-
-// CHECK-LABEL: func.func @muli_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
-// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[LHS]], %[[RHS]] : i16
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[MUL]] : i16 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @muli_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.muli %a, %b : i32
- return %r : i32
-}
-
-// We do not expect this case to be optimized because given n-bit operands,
-// arith.muli produces 2n bits of result.
-//
-// CHECK-LABEL: func.func @muli_extsi_i32
-// CHECK-SAME: (%[[ARG0:.+]]: i16, %[[ARG1:.+]]: i16)
-// CHECK-NEXT: %[[LHS:.+]] = arith.extsi %[[ARG0]] : i16 to i32
-// CHECK-NEXT: %[[RHS:.+]] = arith.extsi %[[ARG1]] : i16 to i32
-// CHECK-NEXT: %[[RET:.+]] = arith.muli %[[LHS]], %[[RHS]] : i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @muli_extsi_i32(%lhs: i16, %rhs: i16) -> i32 {
- %a = arith.extsi %lhs : i16 to i32
- %b = arith.extsi %rhs : i16 to i32
- %r = arith.muli %a, %b : i32
- return %r : i32
-}
-
-// This case should not get optimized because of mixed extensions.
-//
-// CHECK-LABEL: func.func @muli_mixed_ext_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[MUL]] : i32
-func.func @muli_mixed_ext_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.muli %a, %b : i32
- return %r : i32
-}
-
-// CHECK-LABEL: func.func @muli_extsi_3xi8_cst
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi8>)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[-1, 127, 42]> : vector<3xi16>
-// CHECK-NEXT: %[[EXT:.+]] = arith.extsi %[[ARG0]] : vector<3xi8> to vector<3xi32>
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT]] : vector<3xi32> to vector<3xi16>
-// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[LHS]], %[[CST]] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[MUL]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @muli_extsi_3xi8_cst(%lhs: vector<3xi8>) -> vector<3xi32> {
- %cst = arith.constant dense<[-1, 127, 42]> : vector<3xi32>
- %a = arith.extsi %lhs : vector<3xi8> to vector<3xi32>
- %r = arith.muli %a, %cst : vector<3xi32>
- return %r : vector<3xi32>
-}
-
-//===----------------------------------------------------------------------===//
-// arith.divsi
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @divsi_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
-// CHECK-NEXT: %[[SUB:.+]] = arith.divsi %[[LHS]], %[[RHS]] : i16
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[SUB]] : i16 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @divsi_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.divsi %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.divsi` ops with sign-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @divsi_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[SUB:.+]] = arith.divsi %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[SUB]] : i32
-func.func @divsi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.divsi %a, %b : i32
- return %r : i32
-}
-
-// arith.divsi produces one more bit of result than the operand bitwidth.
-//
-// CHECK-LABEL: func.func @divsi_extsi_i24
-// CHECK-SAME: (%[[ARG0:.+]]: i16, %[[ARG1:.+]]: i16)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i16 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i16 to i32
-// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i24
-// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i24
-// CHECK-NEXT: %[[ADD:.+]] = arith.divsi %[[LHS]], %[[RHS]] : i24
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[ADD]] : i24 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @divsi_extsi_i24(%lhs: i16, %rhs: i16) -> i32 {
- %a = arith.extsi %lhs : i16 to i32
- %b = arith.extsi %rhs : i16 to i32
- %r = arith.divsi %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// arith.divui
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @divui_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[SUB:.+]] = arith.divui %[[ARG0]], %[[ARG1]] : i8
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[SUB]] : i8 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @divui_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.divui %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.divui` ops with zero-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @divui_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[SUB:.+]] = arith.divui %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[SUB]] : i32
-func.func @divui_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.divui %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// arith.*itofp
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @sitofp_extsi_i16
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[ARG]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @sitofp_extsi_i16(%a: i16) -> f16 {
- %b = arith.extsi %a : i16 to i32
- %f = arith.sitofp %b : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @sitofp_extsi_vector_i16
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>)
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[ARG]] : vector<3xi16> to vector<3xf16>
-// CHECK-NEXT: return %[[RET]] : vector<3xf16>
-func.func @sitofp_extsi_vector_i16(%a: vector<3xi16>) -> vector<3xf16> {
- %b = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %f = arith.sitofp %b : vector<3xi32> to vector<3xf16>
- return %f : vector<3xf16>
-}
-
-// CHECK-LABEL: func.func @sitofp_extsi_tensor_i16
-// CHECK-SAME: (%[[ARG:.+]]: tensor<3x?xi16>)
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[ARG]] : tensor<3x?xi16> to tensor<3x?xf16>
-// CHECK-NEXT: return %[[RET]] : tensor<3x?xf16>
-func.func @sitofp_extsi_tensor_i16(%a: tensor<3x?xi16>) -> tensor<3x?xf16> {
- %b = arith.extsi %a : tensor<3x?xi16> to tensor<3x?xi32>
- %f = arith.sitofp %b : tensor<3x?xi32> to tensor<3x?xf16>
- return %f : tensor<3x?xf16>
-}
-
-// Narrowing to i64 is not enabled in pass options.
-//
-// CHECK-LABEL: func.func @sitofp_extsi_i64
-// CHECK-SAME: (%[[ARG:.+]]: i64)
-// CHECK-NEXT: %[[EXT:.+]] = arith.extsi %[[ARG]] : i64 to i128
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[EXT]] : i128 to f32
-// CHECK-NEXT: return %[[RET]] : f32
-func.func @sitofp_extsi_i64(%a: i64) -> f32 {
- %b = arith.extsi %a : i64 to i128
- %f = arith.sitofp %b : i128 to f32
- return %f : f32
-}
-
-// CHECK-LABEL: func.func @uitofp_extui_i16
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[ARG]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @uitofp_extui_i16(%a: i16) -> f16 {
- %b = arith.extui %a : i16 to i32
- %f = arith.uitofp %b : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @sitofp_extsi_extsi_i8
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[ARG]] : i8 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @sitofp_extsi_extsi_i8(%a: i8) -> f16 {
- %b = arith.extsi %a : i8 to i16
- %c = arith.extsi %b : i16 to i32
- %f = arith.sitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @uitofp_extui_extui_i8
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[ARG]] : i8 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @uitofp_extui_extui_i8(%a: i8) -> f16 {
- %b = arith.extui %a : i8 to i16
- %c = arith.extui %b : i16 to i32
- %f = arith.uitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @uitofp_extsi_extui_i8
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[EXT:.+]] = arith.extsi %[[ARG]] : i8 to i16
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[EXT]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @uitofp_extsi_extui_i8(%a: i8) -> f16 {
- %b = arith.extsi %a : i8 to i16
- %c = arith.extui %b : i16 to i32
- %f = arith.uitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @uitofp_trunci_extui_i8
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[TR:.+]] = arith.trunci %[[ARG]] : i16 to i8
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[TR]] : i8 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @uitofp_trunci_extui_i8(%a: i16) -> f16 {
- %b = arith.trunci %a : i16 to i8
- %c = arith.extui %b : i8 to i32
- %f = arith.uitofp %c : i32 to f16
- return %f : f16
-}
-
-// This should not be folded because arith.extui changes the signed
-// range of the number. For example:
-// extsi -1 : i16 to i32 ==> -1
-// extui -1 : i16 to i32 ==> U16_MAX
-//
-// CHECK-LABEL: func.func @sitofp_extui_i16
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[EXT:.+]] = arith.extui %[[ARG]] : i16 to i32
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[EXT]] : i32 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @sitofp_extui_i16(%a: i16) -> f16 {
- %b = arith.extui %a : i16 to i32
- %f = arith.sitofp %b : i32 to f16
- return %f : f16
-}
-
-// This should not be folded because arith.extsi changes the unsigned
-// range of the number. For example:
-// extsi -1 : i16 to i32 ==> U32_MAX
-// extui -1 : i16 to i32 ==> U16_MAX
-//
-// CHECK-LABEL: func.func @uitofp_extsi_i16
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[EXT:.+]] = arith.extsi %[[ARG]] : i16 to i32
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[EXT]] : i32 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @uitofp_extsi_i16(%a: i16) -> f16 {
- %b = arith.extsi %a : i16 to i32
- %f = arith.uitofp %b : i32 to f16
- return %f : f16
-}
-
-//===----------------------------------------------------------------------===//
-// arith.maxsi
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @maxsi_extsi_i8
-// CHECK-SAME: (%[[LHS:.+]]: i8, %[[RHS:.+]]: i8)
-// CHECK-NEXT: %[[MAX:.+]] = arith.maxsi %[[LHS]], %[[RHS]] : i8
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[MAX]] : i8 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @maxsi_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.maxsi %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.maxsi` ops with sign-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @maxsi_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[MAX:.+]] = arith.maxsi %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[MAX]] : i32
-func.func @maxsi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.maxsi %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// arith.maxui
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @maxui_extui_i8
-// CHECK-SAME: (%[[LHS:.+]]: i8, %[[RHS:.+]]: i8)
-// CHECK-NEXT: %[[MAX:.+]] = arith.maxui %[[LHS]], %[[RHS]] : i8
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[MAX]] : i8 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @maxui_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.maxui %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.maxui` ops with zero-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @maxui_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[MAX:.+]] = arith.maxui %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[MAX]] : i32
-func.func @maxui_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.maxui %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// arith.minsi
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @minsi_extsi_i8
-// CHECK-SAME: (%[[LHS:.+]]: i8, %[[RHS:.+]]: i8)
-// CHECK-NEXT: %[[min:.+]] = arith.minsi %[[LHS]], %[[RHS]] : i8
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[min]] : i8 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @minsi_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.minsi %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.minsi` ops with sign-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @minsi_extui_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[min:.+]] = arith.minsi %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[min]] : i32
-func.func @minsi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.minsi %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// arith.minui
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @minui_extui_i8
-// CHECK-SAME: (%[[LHS:.+]]: i8, %[[RHS:.+]]: i8)
-// CHECK-NEXT: %[[min:.+]] = arith.minui %[[LHS]], %[[RHS]] : i8
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[min]] : i8 to i32
-// CHECK-NEXT: return %[[RET]] : i32
-func.func @minui_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extui %lhs : i8 to i32
- %b = arith.extui %rhs : i8 to i32
- %r = arith.minui %a, %b : i32
- return %r : i32
-}
-
-// This pattern should only apply to `arith.minui` ops with zero-extended
-// arguments.
-//
-// CHECK-LABEL: func.func @minui_extsi_i8
-// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
-// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
-// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i32
-// CHECK-NEXT: %[[min:.+]] = arith.minui %[[EXT0]], %[[EXT1]] : i32
-// CHECK-NEXT: return %[[min]] : i32
-func.func @minui_extsi_i8(%lhs: i8, %rhs: i8) -> i32 {
- %a = arith.extsi %lhs : i8 to i32
- %b = arith.extsi %rhs : i8 to i32
- %r = arith.minui %a, %b : i32
- return %r : i32
-}
-
-//===----------------------------------------------------------------------===//
-// Commute Extension over Vector Ops
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func.func @extsi_over_extract_3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extract %[[ARG]][1] : i16 from vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[EXTR]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @extsi_over_extract_3xi16(%a: vector<3xi16>) -> f16 {
- %b = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %c = vector.extract %b[1] : i32 from vector<3xi32>
- %f = arith.sitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @extui_over_extract_3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extract %[[ARG]][1] : i16 from vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[EXTR]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @extui_over_extract_3xi16(%a: vector<3xi16>) -> f16 {
- %b = arith.extui %a : vector<3xi16> to vector<3xi32>
- %c = vector.extract %b[1] : i32 from vector<3xi32>
- %f = arith.uitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @extsi_over_extractelement_3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>, %[[POS:.+]]: i32)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extractelement %[[ARG]][%[[POS]] : i32] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.sitofp %[[EXTR]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @extsi_over_extractelement_3xi16(%a: vector<3xi16>, %pos: i32) -> f16 {
- %b = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %c = vector.extractelement %b[%pos : i32] : vector<3xi32>
- %f = arith.sitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @extui_over_extractelement_3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>, %[[POS:.+]]: i32)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extractelement %[[ARG]][%[[POS]] : i32] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.uitofp %[[EXTR]] : i16 to f16
-// CHECK-NEXT: return %[[RET]] : f16
-func.func @extui_over_extractelement_3xi16(%a: vector<3xi16>, %pos: i32) -> f16 {
- %b = arith.extui %a : vector<3xi16> to vector<3xi32>
- %c = vector.extractelement %b[%pos : i32] : vector<3xi32>
- %f = arith.uitofp %c : i32 to f16
- return %f : f16
-}
-
-// CHECK-LABEL: func.func @extsi_over_extract_strided_slice_1d
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extract_strided_slice %[[ARG]] {offsets = [1], sizes = [2], strides = [1]} : vector<3xi16> to vector<2xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[EXTR]] : vector<2xi16> to vector<2xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2xi32>
-func.func @extsi_over_extract_strided_slice_1d(%a: vector<3xi16>) -> vector<2xi32> {
- %b = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %c = vector.extract_strided_slice %b
- {offsets = [1], sizes = [2], strides = [1]} : vector<3xi32> to vector<2xi32>
- return %c : vector<2xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_extract_strided_slice_1d
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extract_strided_slice %[[ARG]] {offsets = [1], sizes = [2], strides = [1]} : vector<3xi16> to vector<2xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[EXTR]] : vector<2xi16> to vector<2xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2xi32>
-func.func @extui_over_extract_strided_slice_1d(%a: vector<3xi16>) -> vector<2xi32> {
- %b = arith.extui %a : vector<3xi16> to vector<3xi32>
- %c = vector.extract_strided_slice %b
- {offsets = [1], sizes = [2], strides = [1]} : vector<3xi32> to vector<2xi32>
- return %c : vector<2xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_extract_strided_slice_2d
-// CHECK-SAME: (%[[ARG:.+]]: vector<2x3xi16>)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 2], strides = [1, 1]} : vector<2x3xi16> to vector<1x2xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[EXTR]] : vector<1x2xi16> to vector<1x2xi32>
-// CHECK-NEXT: return %[[RET]] : vector<1x2xi32>
-func.func @extsi_over_extract_strided_slice_2d(%a: vector<2x3xi16>) -> vector<1x2xi32> {
- %b = arith.extsi %a : vector<2x3xi16> to vector<2x3xi32>
- %c = vector.extract_strided_slice %b
- {offsets = [1, 1], sizes = [1, 2], strides = [1, 1]} : vector<2x3xi32> to vector<1x2xi32>
- return %c : vector<1x2xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_extract_strided_slice_2d
-// CHECK-SAME: (%[[ARG:.+]]: vector<2x3xi16>)
-// CHECK-NEXT: %[[EXTR:.+]] = vector.extract_strided_slice %arg0 {offsets = [1, 1], sizes = [1, 2], strides = [1, 1]} : vector<2x3xi16> to vector<1x2xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[EXTR]] : vector<1x2xi16> to vector<1x2xi32>
-// CHECK-NEXT: return %[[RET]] : vector<1x2xi32>
-func.func @extui_over_extract_strided_slice_2d(%a: vector<2x3xi16>) -> vector<1x2xi32> {
- %b = arith.extui %a : vector<2x3xi16> to vector<2x3xi32>
- %c = vector.extract_strided_slice %b
- {offsets = [1, 1], sizes = [1, 2], strides = [1, 1]} : vector<2x3xi32> to vector<1x2xi32>
- return %c : vector<1x2xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insert_3xi16
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi16>, %[[ARG1:.+]]: i16)
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[ARG1]], %[[ARG0]] [1] : i16 into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insert_3xi16(%a: vector<3xi16>, %b: i16) -> vector<3xi32> {
- %c = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %d = arith.extsi %b : i16 to i32
- %e = vector.insert %d, %c [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insert_3xi16
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi16>, %[[ARG1:.+]]: i16)
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[ARG1]], %[[ARG0]] [1] : i16 into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extui_over_insert_3xi16(%a: vector<3xi16>, %b: i16) -> vector<3xi32> {
- %c = arith.extui %a : vector<3xi16> to vector<3xi32>
- %d = arith.extui %b : i16 to i32
- %e = vector.insert %d, %c [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insert_3xi16_cst_0
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<0> : vector<3xi16>
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[ARG]], %[[CST]] [1] : i16 into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insert_3xi16_cst_0(%a: i16) -> vector<3xi32> {
- %cst = arith.constant dense<0> : vector<3xi32>
- %d = arith.extsi %a : i16 to i32
- %e = vector.insert %d, %cst [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insert_3xi8_cst
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[-1, 127, -128]> : vector<3xi8>
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[ARG]], %[[CST]] [1] : i8 into vector<3xi8>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi8> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insert_3xi8_cst(%a: i8) -> vector<3xi32> {
- %cst = arith.constant dense<[-1, 127, -128]> : vector<3xi32>
- %d = arith.extsi %a : i8 to i32
- %e = vector.insert %d, %cst [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insert_3xi8_cst
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[1, 127, -1]> : vector<3xi8>
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[ARG]], %[[CST]] [1] : i8 into vector<3xi8>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<3xi8> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extui_over_insert_3xi8_cst(%a: i8) -> vector<3xi32> {
- %cst = arith.constant dense<[1, 127, 255]> : vector<3xi32>
- %d = arith.extui %a : i8 to i32
- %e = vector.insert %d, %cst [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insert_3xi16_cst_i16
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[-1, 128, 0]> : vector<3xi16>
-// CHECK-NEXT: %[[SRCE:.+]] = arith.extsi %[[ARG]] : i8 to i32
-// CHECK-NEXT: %[[SRCT:.+]] = arith.trunci %[[SRCE]] : i32 to i16
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[SRCT]], %[[CST]] [1] : i16 into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insert_3xi16_cst_i16(%a: i8) -> vector<3xi32> {
- %cst = arith.constant dense<[-1, 128, 0]> : vector<3xi32>
- %d = arith.extsi %a : i8 to i32
- %e = vector.insert %d, %cst [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insert_3xi16_cst_i16
-// CHECK-SAME: (%[[ARG:.+]]: i8)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[1, 256, 0]> : vector<3xi16>
-// CHECK-NEXT: %[[SRCE:.+]] = arith.extui %[[ARG]] : i8 to i32
-// CHECK-NEXT: %[[SRCT:.+]] = arith.trunci %[[SRCE]] : i32 to i16
-// CHECK-NEXT: %[[INS:.+]] = vector.insert %[[SRCT]], %[[CST]] [1] : i16 into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extui_over_insert_3xi16_cst_i16(%a: i8) -> vector<3xi32> {
- %cst = arith.constant dense<[1, 256, 0]> : vector<3xi32>
- %d = arith.extui %a : i8 to i32
- %e = vector.insert %d, %cst [1] : i32 into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insertelement_3xi16
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi16>, %[[ARG1:.+]]: i16, %[[POS:.+]]: i32)
-// CHECK-NEXT: %[[INS:.+]] = vector.insertelement %[[ARG1]], %[[ARG0]][%[[POS]] : i32] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insertelement_3xi16(%a: vector<3xi16>, %b: i16, %pos: i32) -> vector<3xi32> {
- %c = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %d = arith.extsi %b : i16 to i32
- %e = vector.insertelement %d, %c[%pos : i32] : vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insertelement_3xi16
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi16>, %[[ARG1:.+]]: i16, %[[POS:.+]]: i32)
-// CHECK-NEXT: %[[INS:.+]] = vector.insertelement %[[ARG1]], %[[ARG0]][%[[POS]] : i32] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extui_over_insertelement_3xi16(%a: vector<3xi16>, %b: i16, %pos: i32) -> vector<3xi32> {
- %c = arith.extui %a : vector<3xi16> to vector<3xi32>
- %d = arith.extui %b : i16 to i32
- %e = vector.insertelement %d, %c[%pos : i32] : vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insertelement_3xi16_cst_i16
-// CHECK-SAME: (%[[ARG:.+]]: i8, %[[POS:.+]]: i32)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[-1, 128, 0]> : vector<3xi16>
-// CHECK-NEXT: %[[SRCE:.+]] = arith.extsi %[[ARG]] : i8 to i32
-// CHECK-NEXT: %[[SRCT:.+]] = arith.trunci %[[SRCE]] : i32 to i16
-// CHECK-NEXT: %[[INS:.+]] = vector.insertelement %[[SRCT]], %[[CST]][%[[POS]] : i32] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insertelement_3xi16_cst_i16(%a: i8, %pos: i32) -> vector<3xi32> {
- %cst = arith.constant dense<[-1, 128, 0]> : vector<3xi32>
- %d = arith.extsi %a : i8 to i32
- %e = vector.insertelement %d, %cst[%pos : i32] : vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insertelement_3xi16_cst_i16
-// CHECK-SAME: (%[[ARG:.+]]: i8, %[[POS:.+]]: i32)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant dense<[1, 256, 0]> : vector<3xi16>
-// CHECK-NEXT: %[[SRCE:.+]] = arith.extui %[[ARG]] : i8 to i32
-// CHECK-NEXT: %[[SRCT:.+]] = arith.trunci %[[SRCE]] : i32 to i16
-// CHECK-NEXT: %[[INS:.+]] = vector.insertelement %[[SRCT]], %[[CST]][%[[POS]] : i32] : vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extui_over_insertelement_3xi16_cst_i16(%a: i8, %pos: i32) -> vector<3xi32> {
- %cst = arith.constant dense<[1, 256, 0]> : vector<3xi32>
- %d = arith.extui %a : i8 to i32
- %e = vector.insertelement %d, %cst[%pos : i32] : vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insert_strided_slice_1d
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi16>, %[[ARG1:.+]]: vector<2xi16>)
-// CHECK-NEXT: %[[INS:.+]] = vector.insert_strided_slice %[[ARG1]], %[[ARG0]]
-// CHECK-SAME: {offsets = [1], strides = [1]} : vector<2xi16> into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_insert_strided_slice_1d(%a: vector<3xi16>, %b: vector<2xi16>) -> vector<3xi32> {
- %c = arith.extsi %a : vector<3xi16> to vector<3xi32>
- %d = arith.extsi %b : vector<2xi16> to vector<2xi32>
- %e = vector.insert_strided_slice %d, %c {offsets = [1], strides = [1]} : vector<2xi32> into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insert_strided_slice_1d
-// CHECK-SAME: (%[[ARG0:.+]]: vector<3xi16>, %[[ARG1:.+]]: vector<2xi16>)
-// CHECK-NEXT: %[[INS:.+]] = vector.insert_strided_slice %[[ARG1]], %[[ARG0]]
-// CHECK-SAME: {offsets = [1], strides = [1]} : vector<2xi16> into vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extui_over_insert_strided_slice_1d(%a: vector<3xi16>, %b: vector<2xi16>) -> vector<3xi32> {
- %c = arith.extui %a : vector<3xi16> to vector<3xi32>
- %d = arith.extui %b : vector<2xi16> to vector<2xi32>
- %e = vector.insert_strided_slice %d, %c {offsets = [1], strides = [1]} : vector<2xi32> into vector<3xi32>
- return %e : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_insert_strided_slice_cst_2d
-// CHECK-SAME: (%[[ARG:.+]]: vector<1x2xi8>)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant
-// CHECK-SAME{LITERAL}: dense<[[-1, 128, 0], [-129, 42, 1337]]> : vector<2x3xi16>
-// CHECK-NEXT: %[[SRCE:.+]] = arith.extsi %[[ARG]] : vector<1x2xi8> to vector<1x2xi32>
-// CHECK-NEXT: %[[SRCT:.+]] = arith.trunci %[[SRCE]] : vector<1x2xi32> to vector<1x2xi16>
-// CHECK-NEXT: %[[INS:.+]] = vector.insert_strided_slice %[[SRCT]], %[[CST]]
-// CHECK-SAME: {offsets = [0, 1], strides = [1, 1]} : vector<1x2xi16> into vector<2x3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[INS]] : vector<2x3xi16> to vector<2x3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2x3xi32>
-func.func @extsi_over_insert_strided_slice_cst_2d(%a: vector<1x2xi8>) -> vector<2x3xi32> {
- %cst = arith.constant dense<[[-1, 128, 0], [-129, 42, 1337]]> : vector<2x3xi32>
- %d = arith.extsi %a : vector<1x2xi8> to vector<1x2xi32>
- %e = vector.insert_strided_slice %d, %cst {offsets = [0, 1], strides = [1, 1]} : vector<1x2xi32> into vector<2x3xi32>
- return %e : vector<2x3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_insert_strided_slice_cst_2d
-// CHECK-SAME: (%[[ARG:.+]]: vector<1x2xi8>)
-// CHECK-NEXT: %[[CST:.+]] = arith.constant
-// CHECK-SAME{LITERAL}: dense<[[1, 128, 0], [256, 42, 1337]]> : vector<2x3xi16>
-// CHECK-NEXT: %[[SRCE:.+]] = arith.extui %[[ARG]] : vector<1x2xi8> to vector<1x2xi32>
-// CHECK-NEXT: %[[SRCT:.+]] = arith.trunci %[[SRCE]] : vector<1x2xi32> to vector<1x2xi16>
-// CHECK-NEXT: %[[INS:.+]] = vector.insert_strided_slice %[[SRCT]], %[[CST]]
-// CHECK-SAME: {offsets = [0, 1], strides = [1, 1]} : vector<1x2xi16> into vector<2x3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[INS]] : vector<2x3xi16> to vector<2x3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2x3xi32>
-func.func @extui_over_insert_strided_slice_cst_2d(%a: vector<1x2xi8>) -> vector<2x3xi32> {
- %cst = arith.constant dense<[[1, 128, 0], [256, 42, 1337]]> : vector<2x3xi32>
- %d = arith.extui %a : vector<1x2xi8> to vector<1x2xi32>
- %e = vector.insert_strided_slice %d, %cst {offsets = [0, 1], strides = [1, 1]} : vector<1x2xi32> into vector<2x3xi32>
- return %e : vector<2x3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_broadcast_3xi16
-// CHECK-SAME: (%[[ARG:.+]]: i16)
-// CHECK-NEXT: %[[BCST:.+]] = vector.broadcast %[[ARG]] : i16 to vector<3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[BCST]] : vector<3xi16> to vector<3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3xi32>
-func.func @extsi_over_broadcast_3xi16(%a: i16) -> vector<3xi32> {
- %b = arith.extsi %a : i16 to i32
- %r = vector.broadcast %b : i32 to vector<3xi32>
- return %r : vector<3xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_broadcast_2x3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<3xi16>)
-// CHECK-NEXT: %[[BCST:.+]] = vector.broadcast %[[ARG]] : vector<3xi16> to vector<2x3xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[BCST]] : vector<2x3xi16> to vector<2x3xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2x3xi32>
-func.func @extui_over_broadcast_2x3xi16(%a: vector<3xi16>) -> vector<2x3xi32> {
- %b = arith.extui %a : vector<3xi16> to vector<3xi32>
- %r = vector.broadcast %b : vector<3xi32> to vector<2x3xi32>
- return %r : vector<2x3xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_shape_cast_2x3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<2x3xi16>)
-// CHECK-NEXT: %[[CAST:.+]] = vector.shape_cast %[[ARG]] : vector<2x3xi16> to vector<3x2xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[CAST]] : vector<3x2xi16> to vector<3x2xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3x2xi32>
-func.func @extsi_over_shape_cast_2x3xi16(%a: vector<2x3xi16>) -> vector<3x2xi32> {
- %b = arith.extsi %a : vector<2x3xi16> to vector<2x3xi32>
- %r = vector.shape_cast %b : vector<2x3xi32> to vector<3x2xi32>
- return %r : vector<3x2xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_shape_cast_5x2x3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<5x2x3xi16>)
-// CHECK-NEXT: %[[CAST:.+]] = vector.shape_cast %[[ARG]] : vector<5x2x3xi16> to vector<2x3x5xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[CAST]] : vector<2x3x5xi16> to vector<2x3x5xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2x3x5xi32>
-func.func @extui_over_shape_cast_5x2x3xi16(%a: vector<5x2x3xi16>) -> vector<2x3x5xi32> {
- %b = arith.extui %a : vector<5x2x3xi16> to vector<5x2x3xi32>
- %r = vector.shape_cast %b : vector<5x2x3xi32> to vector<2x3x5xi32>
- return %r : vector<2x3x5xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_transpose_2x3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<2x3xi16>)
-// CHECK-NEXT: %[[TRAN:.+]] = vector.transpose %[[ARG]], [1, 0] : vector<2x3xi16> to vector<3x2xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[TRAN]] : vector<3x2xi16> to vector<3x2xi32>
-// CHECK-NEXT: return %[[RET]] : vector<3x2xi32>
-func.func @extsi_over_transpose_2x3xi16(%a: vector<2x3xi16>) -> vector<3x2xi32> {
- %b = arith.extsi %a : vector<2x3xi16> to vector<2x3xi32>
- %r = vector.transpose %b, [1, 0] : vector<2x3xi32> to vector<3x2xi32>
- return %r : vector<3x2xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_transpose_5x2x3xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<5x2x3xi16>)
-// CHECK-NEXT: %[[TRAN:.+]] = vector.transpose %[[ARG]], [1, 2, 0] : vector<5x2x3xi16> to vector<2x3x5xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[TRAN]] : vector<2x3x5xi16> to vector<2x3x5xi32>
-// CHECK-NEXT: return %[[RET]] : vector<2x3x5xi32>
-func.func @extui_over_transpose_5x2x3xi16(%a: vector<5x2x3xi16>) -> vector<2x3x5xi32> {
- %b = arith.extui %a : vector<5x2x3xi16> to vector<5x2x3xi32>
- %r = vector.transpose %b, [1, 2, 0] : vector<5x2x3xi32> to vector<2x3x5xi32>
- return %r : vector<2x3x5xi32>
-}
-
-// CHECK-LABEL: func.func @extsi_over_flat_transpose_16xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<16xi16>)
-// CHECK-NEXT: %[[TRAN:.+]] = vector.flat_transpose %[[ARG]] {columns = 4 : i32, rows = 4 : i32} : vector<16xi16> -> vector<16xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extsi %[[TRAN]] : vector<16xi16> to vector<16xi32>
-// CHECK-NEXT: return %[[RET]] : vector<16xi32>
-func.func @extsi_over_flat_transpose_16xi16(%a: vector<16xi16>) -> vector<16xi32> {
- %b = arith.extsi %a : vector<16xi16> to vector<16xi32>
- %r = vector.flat_transpose %b {columns = 4 : i32, rows = 4 : i32} : vector<16xi32> -> vector<16xi32>
- return %r : vector<16xi32>
-}
-
-// CHECK-LABEL: func.func @extui_over_flat_transpose_16xi16
-// CHECK-SAME: (%[[ARG:.+]]: vector<16xi16>)
-// CHECK-NEXT: %[[TRAN:.+]] = vector.flat_transpose %[[ARG]] {columns = 8 : i32, rows = 2 : i32} : vector<16xi16> -> vector<16xi16>
-// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[TRAN]] : vector<16xi16> to vector<16xi32>
-// CHECK-NEXT: return %[[RET]] : vector<16xi32>
-func.func @extui_over_flat_transpose_16xi16(%a: vector<16xi16>) -> vector<16xi32> {
- %b = arith.extui %a : vector<16xi16> to vector<16xi32>
- %r = vector.flat_transpose %b {columns = 8 : i32, rows = 2 : i32} : vector<16xi32> -> vector<16xi32>
- return %r : vector<16xi32>
-}
diff --git a/mlir/test/Dialect/Arith/int-range-narrowing.mlir b/mlir/test/Dialect/Arith/int-range-narrowing.mlir
new file mode 100644
index 0000000..8893f299
--- /dev/null
+++ b/mlir/test/Dialect/Arith/int-range-narrowing.mlir
@@ -0,0 +1,265 @@
+// RUN: mlir-opt --arith-int-range-narrowing="int-bitwidths-supported=1,8,16,24,32" %s | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// Some basic tests
+//===----------------------------------------------------------------------===//
+
+// Do not truncate negative values
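+// (The second operand's signed range includes -1, and narrowing an index
+// through an unsigned cast would not preserve negative values, so the addi
+// is left at the original index width.)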
+// CHECK-LABEL: func @test_addi_neg
+// CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : index
+// CHECK: return %[[RES]] : index
+func.func @test_addi_neg() -> index {
+ %0 = test.with_bounds { umin = 0 : index, umax = 1 : index, smin = 0 : index, smax = 1 : index } : index
+ %1 = test.with_bounds { umin = 0 : index, umax = -1 : index, smin = -1 : index, smax = 0 : index } : index
+ %2 = arith.addi %0, %1 : index
+ return %2 : index
+}
+
+// CHECK-LABEL: func @test_addi
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 5 : index, smin = 4 : index, umax = 5 : index, umin = 4 : index} : index
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 7 : index, smin = 6 : index, umax = 7 : index, umin = 6 : index} : index
+// CHECK: %[[A_CASTED:.*]] = arith.index_castui %[[A]] : index to i8
+// CHECK: %[[B_CASTED:.*]] = arith.index_castui %[[B]] : index to i8
+// CHECK: %[[RES:.*]] = arith.addi %[[A_CASTED]], %[[B_CASTED]] : i8
+// CHECK: %[[RES_CASTED:.*]] = arith.index_castui %[[RES]] : i8 to index
+// CHECK: return %[[RES_CASTED]] : index
+func.func @test_addi() -> index {
+ %0 = test.with_bounds { umin = 4 : index, umax = 5 : index, smin = 4 : index, smax = 5 : index } : index
+ %1 = test.with_bounds { umin = 6 : index, umax = 7 : index, smin = 6 : index, smax = 7 : index } : index
+ %2 = arith.addi %0, %1 : index
+ return %2 : index
+}
+
+// CHECK-LABEL: func @test_addi_vec
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 5 : index, smin = 4 : index, umax = 5 : index, umin = 4 : index} : vector<4xindex>
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 7 : index, smin = 6 : index, umax = 7 : index, umin = 6 : index} : vector<4xindex>
+// CHECK: %[[A_CASTED:.*]] = arith.index_castui %[[A]] : vector<4xindex> to vector<4xi8>
+// CHECK: %[[B_CASTED:.*]] = arith.index_castui %[[B]] : vector<4xindex> to vector<4xi8>
+// CHECK: %[[RES:.*]] = arith.addi %[[A_CASTED]], %[[B_CASTED]] : vector<4xi8>
+// CHECK: %[[RES_CASTED:.*]] = arith.index_castui %[[RES]] : vector<4xi8> to vector<4xindex>
+// CHECK: return %[[RES_CASTED]] : vector<4xindex>
+func.func @test_addi_vec() -> vector<4xindex> {
+ %0 = test.with_bounds { umin = 4 : index, umax = 5 : index, smin = 4 : index, smax = 5 : index } : vector<4xindex>
+ %1 = test.with_bounds { umin = 6 : index, umax = 7 : index, smin = 6 : index, smax = 7 : index } : vector<4xindex>
+ %2 = arith.addi %0, %1 : vector<4xindex>
+ return %2 : vector<4xindex>
+}
+
+// CHECK-LABEL: func @test_addi_i64
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 5 : i64, smin = 4 : i64, umax = 5 : i64, umin = 4 : i64} : i64
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 7 : i64, smin = 6 : i64, umax = 7 : i64, umin = 6 : i64} : i64
+// CHECK: %[[A_CASTED:.*]] = arith.trunci %[[A]] : i64 to i8
+// CHECK: %[[B_CASTED:.*]] = arith.trunci %[[B]] : i64 to i8
+// CHECK: %[[RES:.*]] = arith.addi %[[A_CASTED]], %[[B_CASTED]] : i8
+// CHECK: %[[RES_CASTED:.*]] = arith.extui %[[RES]] : i8 to i64
+// CHECK: return %[[RES_CASTED]] : i64
+func.func @test_addi_i64() -> i64 {
+ %0 = test.with_bounds { umin = 4 : i64, umax = 5 : i64, smin = 4 : i64, smax = 5 : i64 } : i64
+ %1 = test.with_bounds { umin = 6 : i64, umax = 7 : i64, smin = 6 : i64, smax = 7 : i64 } : i64
+ %2 = arith.addi %0, %1 : i64
+ return %2 : i64
+}
+
+// CHECK-LABEL: func @test_cmpi
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[A_CASTED:.*]] = arith.index_castui %[[A]] : index to i8
+// CHECK: %[[B_CASTED:.*]] = arith.index_castui %[[B]] : index to i8
+// CHECK: %[[RES:.*]] = arith.cmpi slt, %[[A_CASTED]], %[[B_CASTED]] : i8
+// CHECK: return %[[RES]] : i1
+func.func @test_cmpi() -> i1 {
+ %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %1 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %2 = arith.cmpi slt, %0, %1 : index
+ return %2 : i1
+}
+
+// CHECK-LABEL: func @test_cmpi_vec
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : vector<4xindex>
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : vector<4xindex>
+// CHECK: %[[A_CASTED:.*]] = arith.index_castui %[[A]] : vector<4xindex> to vector<4xi8>
+// CHECK: %[[B_CASTED:.*]] = arith.index_castui %[[B]] : vector<4xindex> to vector<4xi8>
+// CHECK: %[[RES:.*]] = arith.cmpi slt, %[[A_CASTED]], %[[B_CASTED]] : vector<4xi8>
+// CHECK: return %[[RES]] : vector<4xi1>
+func.func @test_cmpi_vec() -> vector<4xi1> {
+ %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : vector<4xindex>
+ %1 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : vector<4xindex>
+ %2 = arith.cmpi slt, %0, %1 : vector<4xindex>
+ return %2 : vector<4xi1>
+}
+
+// CHECK-LABEL: func @test_add_cmpi
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[C:.*]] = test.with_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index} : index
+// CHECK: %[[A_CASTED:.*]] = arith.index_castui %[[A]] : index to i8
+// CHECK: %[[B_CASTED:.*]] = arith.index_castui %[[B]] : index to i8
+// CHECK: %[[RES1:.*]] = arith.addi %[[A_CASTED]], %[[B_CASTED]] : i8
+// CHECK: %[[C_CASTED:.*]] = arith.index_castui %[[C]] : index to i8
+// CHECK: %[[RES2:.*]] = arith.cmpi slt, %[[C_CASTED]], %[[RES1]] : i8
+// CHECK: return %[[RES2]] : i1
+func.func @test_add_cmpi() -> i1 {
+ %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %1 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %3 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+ %4 = arith.addi %0, %1 : index
+ %5 = arith.cmpi slt, %3, %4 : index
+ return %5 : i1
+}
+
+// CHECK-LABEL: func @test_add_cmpi_i64
+// CHECK: %[[A:.*]] = test.with_bounds {smax = 10 : i64, smin = 0 : i64, umax = 10 : i64, umin = 0 : i64} : i64
+// CHECK: %[[B:.*]] = test.with_bounds {smax = 10 : i64, smin = 0 : i64, umax = 10 : i64, umin = 0 : i64} : i64
+// CHECK: %[[C:.*]] = test.with_bounds {smax = 10 : i64, smin = 0 : i64, umax = 10 : i64, umin = 0 : i64} : i64
+// CHECK: %[[A_CASTED:.*]] = arith.trunci %[[A]] : i64 to i8
+// CHECK: %[[B_CASTED:.*]] = arith.trunci %[[B]] : i64 to i8
+// CHECK: %[[RES1:.*]] = arith.addi %[[A_CASTED]], %[[B_CASTED]] : i8
+// CHECK: %[[C_CASTED:.*]] = arith.trunci %[[C]] : i64 to i8
+// CHECK: %[[RES2:.*]] = arith.cmpi slt, %[[C_CASTED]], %[[RES1]] : i8
+// CHECK: return %[[RES2]] : i1
+func.func @test_add_cmpi_i64() -> i1 {
+ %0 = test.with_bounds { umin = 0 : i64, umax = 10 : i64, smin = 0 : i64, smax = 10 : i64 } : i64
+ %1 = test.with_bounds { umin = 0 : i64, umax = 10 : i64, smin = 0 : i64, smax = 10 : i64 } : i64
+ %3 = test.with_bounds { umin = 0 : i64, umax = 10 : i64, smin = 0 : i64, smax = 10 : i64 } : i64
+ %4 = arith.addi %0, %1 : i64
+ %5 = arith.cmpi slt, %3, %4 : i64
+ return %5 : i1
+}
+
+//===----------------------------------------------------------------------===//
+// arith.addi
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: func.func @addi_extui_i8
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
+// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i16
+// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i16
+// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[LHS]], %[[RHS]] : i16
+// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[ADD]] : i16 to i32
+// CHECK-NEXT: return %[[RET]] : i32
+func.func @addi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
+ %a = arith.extui %lhs : i8 to i32
+ %b = arith.extui %rhs : i8 to i32
+ %r = arith.addi %a, %b : i32
+ return %r : i32
+}
+
+// This case should not get optimized because of mixed extensions.
+//
+// CHECK-LABEL: func.func @addi_mixed_ext_i8
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
+// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[EXT0]], %[[EXT1]] : i32
+// CHECK-NEXT: return %[[ADD]] : i32
+func.func @addi_mixed_ext_i8(%lhs: i8, %rhs: i8) -> i32 {
+ %a = arith.extsi %lhs : i8 to i32
+ %b = arith.extui %rhs : i8 to i32
+ %r = arith.addi %a, %b : i32
+ return %r : i32
+}
+
+// This case should not get optimized because we cannot reduce the bitwidth
+// below i16, given the pass options set.
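+// (The sum of two sign-extended i8 values needs up to 9 bits, and the
+// smallest supported width that can hold it is the i16 already in use.)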
+//
+// CHECK-LABEL: func.func @addi_extsi_i16
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i16
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extsi %[[ARG1]] : i8 to i16
+// CHECK-NEXT: %[[ADD:.+]] = arith.addi %[[EXT0]], %[[EXT1]] : i16
+// CHECK-NEXT: return %[[ADD]] : i16
+func.func @addi_extsi_i16(%lhs: i8, %rhs: i8) -> i16 {
+ %a = arith.extsi %lhs : i8 to i16
+ %b = arith.extsi %rhs : i8 to i16
+ %r = arith.addi %a, %b : i16
+ return %r : i16
+}
+
+//===----------------------------------------------------------------------===//
+// arith.subi
+//===----------------------------------------------------------------------===//
+
+// This pattern should only apply to `arith.subi` ops with sign-extended
+// arguments.
+//
+// CHECK-LABEL: func.func @subi_extui_i8
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
+// CHECK-NEXT: %[[SUB:.+]] = arith.subi %[[EXT0]], %[[EXT1]] : i32
+// CHECK-NEXT: return %[[SUB]] : i32
+func.func @subi_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
+ %a = arith.extui %lhs : i8 to i32
+ %b = arith.extui %rhs : i8 to i32
+ %r = arith.subi %a, %b : i32
+ return %r : i32
+}
+
+// This case should not get optimized because of mixed extensions.
+//
+// CHECK-LABEL: func.func @subi_mixed_ext_i8
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
+// CHECK-NEXT: %[[SUB:.+]] = arith.subi %[[EXT0]], %[[EXT1]] : i32
+// CHECK-NEXT: return %[[SUB]] : i32
+func.func @subi_mixed_ext_i8(%lhs: i8, %rhs: i8) -> i32 {
+ %a = arith.extsi %lhs : i8 to i32
+ %b = arith.extui %rhs : i8 to i32
+ %r = arith.subi %a, %b : i32
+ return %r : i32
+}
+
+//===----------------------------------------------------------------------===//
+// arith.muli
+//===----------------------------------------------------------------------===//
+
+// TODO: This should be optimized into i16
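+// (The zero-extended i8 operands are at most 255, and 255 * 255 = 65025 fits
+// in 16 unsigned bits, so an i16 multiply followed by extui would suffice;
+// the pass currently settles for i24.)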
+// CHECK-LABEL: func.func @muli_extui_i8
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extui %[[ARG0]] : i8 to i32
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
+// CHECK-NEXT: %[[LHS:.+]] = arith.trunci %[[EXT0]] : i32 to i24
+// CHECK-NEXT: %[[RHS:.+]] = arith.trunci %[[EXT1]] : i32 to i24
+// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[LHS]], %[[RHS]] : i24
+// CHECK-NEXT: %[[RET:.+]] = arith.extui %[[MUL]] : i24 to i32
+// CHECK-NEXT: return %[[RET]] : i32
+func.func @muli_extui_i8(%lhs: i8, %rhs: i8) -> i32 {
+ %a = arith.extui %lhs : i8 to i32
+ %b = arith.extui %rhs : i8 to i32
+ %r = arith.muli %a, %b : i32
+ return %r : i32
+}
+
+// We do not expect this case to be optimized because given n-bit operands,
+// arith.muli produces 2n bits of result.
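+// (Here the i16 operands can reach -2^15, and (-2^15) * (-2^15) = 2^30
+// already needs a full 32-bit signed result, so no narrower supported width
+// would be lossless.)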
+//
+// CHECK-LABEL: func.func @muli_extsi_i32
+// CHECK-SAME: (%[[ARG0:.+]]: i16, %[[ARG1:.+]]: i16)
+// CHECK-NEXT: %[[LHS:.+]] = arith.extsi %[[ARG0]] : i16 to i32
+// CHECK-NEXT: %[[RHS:.+]] = arith.extsi %[[ARG1]] : i16 to i32
+// CHECK-NEXT: %[[RET:.+]] = arith.muli %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: return %[[RET]] : i32
+func.func @muli_extsi_i32(%lhs: i16, %rhs: i16) -> i32 {
+ %a = arith.extsi %lhs : i16 to i32
+ %b = arith.extsi %rhs : i16 to i32
+ %r = arith.muli %a, %b : i32
+ return %r : i32
+}
+
+// This case should not get optimized because of mixed extensions.
+//
+// CHECK-LABEL: func.func @muli_mixed_ext_i8
+// CHECK-SAME: (%[[ARG0:.+]]: i8, %[[ARG1:.+]]: i8)
+// CHECK-NEXT: %[[EXT0:.+]] = arith.extsi %[[ARG0]] : i8 to i32
+// CHECK-NEXT: %[[EXT1:.+]] = arith.extui %[[ARG1]] : i8 to i32
+// CHECK-NEXT: %[[MUL:.+]] = arith.muli %[[EXT0]], %[[EXT1]] : i32
+// CHECK-NEXT: return %[[MUL]] : i32
+func.func @muli_mixed_ext_i8(%lhs: i8, %rhs: i8) -> i32 {
+ %a = arith.extsi %lhs : i8 to i32
+ %b = arith.extui %rhs : i8 to i32
+ %r = arith.muli %a, %b : i32
+ return %r : i32
+}
diff --git a/mlir/test/Dialect/Linalg/int-narrowing.mlir b/mlir/test/Dialect/Linalg/int-narrowing.mlir
deleted file mode 100644
index 8063d50..0000000
--- a/mlir/test/Dialect/Linalg/int-narrowing.mlir
+++ /dev/null
@@ -1,147 +0,0 @@
-// RUN: mlir-opt --arith-int-narrowing="int-bitwidths-supported=1,8,16,32" \
-// RUN: --verify-diagnostics %s | FileCheck %s
-
-// Check that we can calculate `linalg.index` value bounds and use them to
-// optimize index casts.
-
-//===----------------------------------------------------------------------===//
-// arith.index_cast
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func @linalg_indexcast_dim_0_i8
-// CHECK: %[[IDX:.+]] = linalg.index 0 : index
-// CHECK-NEXT: %[[INT:.+]] = arith.index_cast %[[IDX]] : index to i8
-// CHECK-NEXT: %[[FP:.+]] = arith.sitofp %[[INT]] : i8 to f16
-// CHECK-NEXT: linalg.yield %[[FP]] : f16
-func.func @linalg_indexcast_dim_0_i8(%arg0: tensor<f16>) -> tensor<128xf16> {
- %init = tensor.empty() : tensor<128xf16>
- %res = linalg.generic {
- indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- }
- ins(%arg0 : tensor<f16>)
- outs(%init : tensor<128xf16>) {
- ^bb0(%in: f16, %out: f16):
- %idx = linalg.index 0 : index
- %int = arith.index_cast %idx : index to i64
- %fp = arith.sitofp %int : i64 to f16
- linalg.yield %fp : f16
- } -> tensor<128xf16>
-
- return %res : tensor<128xf16>
-}
-
-// CHECK-LABEL: func @linalg_indexcast_dim_1_i16
-// CHECK: %[[IDX:.+]] = linalg.index 1 : index
-// CHECK-NEXT: %[[INT:.+]] = arith.index_cast %[[IDX]] : index to i16
-// CHECK-NEXT: %[[FP:.+]] = arith.sitofp %[[INT]] : i16 to f16
-// CHECK-NEXT: linalg.yield %[[FP]] : f16
-func.func @linalg_indexcast_dim_1_i16(%arg0: tensor<f16>, %arg1: tensor<?x129xf16>) -> tensor<?x129xf16> {
- %res = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- }
- ins(%arg0 : tensor<f16>)
- outs(%arg1 : tensor<?x129xf16>) {
- ^bb0(%in: f16, %out: f16):
- %idx = linalg.index 1 : index
- %int = arith.index_cast %idx : index to i64
- %fp = arith.sitofp %int : i64 to f16
- linalg.yield %fp : f16
- } -> tensor<?x129xf16>
-
- return %res : tensor<?x129xf16>
-}
-
-// CHECK-LABEL: func @linalg_indexcast_dynamic_dim_i64
-// CHECK: %[[IDX:.+]] = linalg.index 0 : index
-// CHECK-NEXT: %[[INT:.+]] = arith.index_cast %[[IDX]] : index to i64
-// CHECK-NEXT: %[[FP:.+]] = arith.sitofp %[[INT]] : i64 to f16
-// CHECK-NEXT: linalg.yield %[[FP]] : f16
-func.func @linalg_indexcast_dynamic_dim_i64(%arg0: tensor<f16>, %arg1: tensor<?xf16>) -> tensor<?xf16> {
- %res = linalg.generic {
- indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- }
- ins(%arg0 : tensor<f16>)
- outs(%arg1 : tensor<?xf16>) {
- ^bb0(%in: f16, %out: f16):
- %idx = linalg.index 0 : index
- %int = arith.index_cast %idx : index to i64
- %fp = arith.sitofp %int : i64 to f16
- linalg.yield %fp : f16
- } -> tensor<?xf16>
-
- return %res : tensor<?xf16>
-}
-
-//===----------------------------------------------------------------------===//
-// arith.index_castui
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func @linalg_indexcastui_dim_0_i8
-// CHECK: %[[IDX:.+]] = linalg.index 0 : index
-// CHECK-NEXT: %[[INT:.+]] = arith.index_castui %[[IDX]] : index to i8
-// CHECK-NEXT: %[[FP:.+]] = arith.uitofp %[[INT]] : i8 to f16
-// CHECK-NEXT: linalg.yield %[[FP]] : f16
-func.func @linalg_indexcastui_dim_0_i8(%arg0: tensor<f16>) -> tensor<256xf16> {
- %init = tensor.empty() : tensor<256xf16>
- %res = linalg.generic {
- indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- }
- ins(%arg0 : tensor<f16>)
- outs(%init : tensor<256xf16>) {
- ^bb0(%in: f16, %out: f16):
- %idx = linalg.index 0 : index
- %int = arith.index_castui %idx : index to i64
- %fp = arith.uitofp %int : i64 to f16
- linalg.yield %fp : f16
- } -> tensor<256xf16>
-
- return %res : tensor<256xf16>
-}
-
-// CHECK-LABEL: func @linalg_indexcastui_dim_1_i16
-// CHECK: %[[IDX:.+]] = linalg.index 1 : index
-// CHECK-NEXT: %[[INT:.+]] = arith.index_castui %[[IDX]] : index to i16
-// CHECK-NEXT: %[[FP:.+]] = arith.uitofp %[[INT]] : i16 to f16
-// CHECK-NEXT: linalg.yield %[[FP]] : f16
-func.func @linalg_indexcastui_dim_1_i16(%arg0: tensor<f16>, %arg1: tensor<?x257xf16>) -> tensor<?x257xf16> {
- %res = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- }
- ins(%arg0 : tensor<f16>)
- outs(%arg1 : tensor<?x257xf16>) {
- ^bb0(%in: f16, %out: f16):
- %idx = linalg.index 1 : index
- %int = arith.index_castui %idx : index to i64
- %fp = arith.uitofp %int : i64 to f16
- linalg.yield %fp : f16
- } -> tensor<?x257xf16>
-
- return %res : tensor<?x257xf16>
-}
-
-// CHECK-LABEL: func @linalg_indexcastui_dynamic_dim_i64
-// CHECK: %[[IDX:.+]] = linalg.index 0 : index
-// CHECK-NEXT: %[[INT:.+]] = arith.index_castui %[[IDX]] : index to i64
-// CHECK-NEXT: %[[FP:.+]] = arith.uitofp %[[INT]] : i64 to f16
-// CHECK-NEXT: linalg.yield %[[FP]] : f16
-func.func @linalg_indexcastui_dynamic_dim_i64(%arg0: tensor<f16>, %arg1: tensor<?xf16>) -> tensor<?xf16> {
- %res = linalg.generic {
- indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- }
- ins(%arg0 : tensor<f16>)
- outs(%arg1 : tensor<?xf16>) {
- ^bb0(%in: f16, %out: f16):
- %idx = linalg.index 0 : index
- %int = arith.index_castui %idx : index to i64
- %fp = arith.uitofp %int : i64 to f16
- linalg.yield %fp : f16
- } -> tensor<?xf16>
-
- return %res : tensor<?xf16>
-}
diff --git a/mlir/test/Dialect/Math/polynomial-approximation.mlir b/mlir/test/Dialect/Math/polynomial-approximation.mlir
index 93ecd67..81d071e 100644
--- a/mlir/test/Dialect/Math/polynomial-approximation.mlir
+++ b/mlir/test/Dialect/Math/polynomial-approximation.mlir
@@ -894,6 +894,47 @@ func.func @math_f16(%arg0 : vector<4xf16>) -> vector<4xf16> {
return %11 : vector<4xf16>
}
+// CHECK-LABEL: @math_zero_rank
+func.func @math_zero_rank(%arg0 : vector<f16>) -> vector<f16> {
+
+ // CHECK-NOT: math.atan
+ %0 = "math.atan"(%arg0) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.atan2
+ %1 = "math.atan2"(%0, %arg0) : (vector<f16>, vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.tanh
+ %2 = "math.tanh"(%1) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.log
+ %3 = "math.log"(%2) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.log2
+ %4 = "math.log2"(%3) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.log1p
+ %5 = "math.log1p"(%4) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.erf
+ %6 = "math.erf"(%5) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.exp
+ %7 = "math.exp"(%6) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.expm1
+ %8 = "math.expm1"(%7) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.cbrt
+ %9 = "math.cbrt"(%8) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.sin
+ %10 = "math.sin"(%9) : (vector<f16>) -> vector<f16>
+
+ // CHECK-NOT: math.cos
+ %11 = "math.cos"(%10) : (vector<f16>) -> vector<f16>
+
+ return %11 : vector<f16>
+}
// AVX2-LABEL: @rsqrt_f16
func.func @rsqrt_f16(%arg0 : vector<2x8xf16>) -> vector<2x8xf16> {
diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir
index 96edb58..0cc65f7 100644
--- a/mlir/test/Dialect/OpenACC/invalid.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid.mlir
@@ -611,7 +611,7 @@ func.func @acc_atomic_update(%x: memref<i32>, %expr: i32) {
func.func @acc_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
// expected-error @below {{expected three operations in atomic.capture region}}
acc.atomic.capture {
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
acc.terminator
}
return
@@ -622,8 +622,8 @@ func.func @acc_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
func.func @acc_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
acc.atomic.capture {
// expected-error @below {{invalid sequence of operations in the capture region}}
- acc.atomic.read %v = %x : memref<i32>, i32
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
acc.terminator
}
return
@@ -699,7 +699,7 @@ func.func @acc_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
acc.atomic.capture {
// expected-error @below {{invalid sequence of operations in the capture region}}
acc.atomic.write %x = %expr : memref<i32>, i32
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
acc.terminator
}
return
@@ -715,7 +715,7 @@ func.func @acc_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>,
%newval = llvm.add %xval, %expr : i32
acc.yield %newval : i32
}
- acc.atomic.read %v = %y : memref<i32>, i32
+ acc.atomic.read %v = %y : memref<i32>, memref<i32>, i32
acc.terminator
}
}
@@ -725,7 +725,7 @@ func.func @acc_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>,
func.func @acc_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>, %expr: i32) {
acc.atomic.capture {
// expected-error @below {{captured variable in atomic.read must be updated in second operation}}
- acc.atomic.read %v = %y : memref<i32>, i32
+ acc.atomic.read %v = %y : memref<i32>, memref<i32>, i32
acc.atomic.update %x : memref<i32> {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -740,7 +740,7 @@ func.func @acc_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>,
func.func @acc_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>, %expr: i32) {
acc.atomic.capture {
// expected-error @below {{captured variable in atomic.read must be updated in second operation}}
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
acc.atomic.write %y = %expr : memref<i32>, i32
acc.terminator
}
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 2ef2178..3ed81b5 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -1703,8 +1703,8 @@ acc.set default_async(%i32Value : i32)
// CHECK-LABEL: func.func @acc_atomic_read
// CHECK-SAME: (%[[v:.*]]: memref<i32>, %[[x:.*]]: memref<i32>)
func.func @acc_atomic_read(%v: memref<i32>, %x: memref<i32>) {
- // CHECK: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
- acc.atomic.read %v = %x : memref<i32>, i32
+ // CHECK: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
return
}
@@ -1806,7 +1806,7 @@ func.func @acc_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: acc.yield %[[newval]] : i32
// CHECK-NEXT: }
- // CHECK-NEXT: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
acc.atomic.capture {
acc.atomic.update %x : memref<i32> {
@@ -1814,10 +1814,10 @@ func.func @acc_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
acc.yield %newval : i32
}
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: acc.atomic.capture {
- // CHECK-NEXT: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: acc.atomic.update %[[x]] : memref<i32>
// CHECK-NEXT: (%[[xval:.*]]: i32):
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
@@ -1825,7 +1825,7 @@ func.func @acc_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: }
// CHECK-NEXT: }
acc.atomic.capture {
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
acc.atomic.update %x : memref<i32> {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -1833,11 +1833,11 @@ func.func @acc_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
}
}
// CHECK: acc.atomic.capture {
- // CHECK-NEXT: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: acc.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: acc.atomic.write %[[x]] = %[[expr]] : memref<i32>, i32
// CHECK-NEXT: }
acc.atomic.capture {
- acc.atomic.read %v = %x : memref<i32>, i32
+ acc.atomic.read %v = %x : memref<i32>, memref<i32>, i32
acc.atomic.write %x = %expr : memref<i32>, i32
}
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index fd89ec3..db941d40 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -891,7 +891,7 @@ func.func @omp_ordered5(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec
func.func @omp_atomic_read1(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{the hints omp_sync_hint_nonspeculative and omp_sync_hint_speculative cannot be combined.}}
- omp.atomic.read %v = %x hint(speculative, nonspeculative) : memref<i32>, i32
+ omp.atomic.read %v = %x hint(speculative, nonspeculative) : memref<i32>, memref<i32>, i32
return
}
@@ -899,7 +899,7 @@ func.func @omp_atomic_read1(%x: memref<i32>, %v: memref<i32>) {
func.func @omp_atomic_read2(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{invalid clause value: 'xyz'}}
- omp.atomic.read %v = %x memory_order(xyz) : memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(xyz) : memref<i32>, memref<i32>, i32
return
}
@@ -907,7 +907,7 @@ func.func @omp_atomic_read2(%x: memref<i32>, %v: memref<i32>) {
func.func @omp_atomic_read3(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{memory-order must not be acq_rel or release for atomic reads}}
- omp.atomic.read %v = %x memory_order(acq_rel) : memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(acq_rel) : memref<i32>, memref<i32>, i32
return
}
@@ -915,7 +915,7 @@ func.func @omp_atomic_read3(%x: memref<i32>, %v: memref<i32>) {
func.func @omp_atomic_read4(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{memory-order must not be acq_rel or release for atomic reads}}
- omp.atomic.read %v = %x memory_order(release) : memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(release) : memref<i32>, memref<i32>, i32
return
}
@@ -923,7 +923,7 @@ func.func @omp_atomic_read4(%x: memref<i32>, %v: memref<i32>) {
func.func @omp_atomic_read5(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{`memory_order` clause can appear at most once in the expansion of the oilist directive}}
- omp.atomic.read %v = %x memory_order(acquire) memory_order(relaxed) : memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(acquire) memory_order(relaxed) : memref<i32>, memref<i32>, i32
return
}
@@ -931,7 +931,7 @@ func.func @omp_atomic_read5(%x: memref<i32>, %v: memref<i32>) {
func.func @omp_atomic_read6(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{`hint` clause can appear at most once in the expansion of the oilist directive}}
- omp.atomic.read %v = %x hint(speculative) hint(contended) : memref<i32>, i32
+ omp.atomic.read %v = %x hint(speculative) hint(contended) : memref<i32>, memref<i32>, i32
return
}
@@ -939,7 +939,7 @@ func.func @omp_atomic_read6(%x: memref<i32>, %v: memref<i32>) {
func.func @omp_atomic_read6(%x: memref<i32>, %v: memref<i32>) {
// expected-error @below {{read and write must not be to the same location for atomic reads}}
- omp.atomic.read %x = %x hint(speculative) : memref<i32>, i32
+ omp.atomic.read %x = %x hint(speculative) : memref<i32>, memref<i32>, i32
return
}
@@ -1137,7 +1137,7 @@ func.func @omp_atomic_update(%x: memref<i32>, %expr: i32) {
func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
// expected-error @below {{expected three operations in atomic.capture region}}
omp.atomic.capture {
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
omp.terminator
}
return
@@ -1148,8 +1148,8 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
omp.atomic.capture {
// expected-error @below {{invalid sequence of operations in the capture region}}
- omp.atomic.read %v = %x : memref<i32>, i32
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
omp.terminator
}
return
@@ -1225,7 +1225,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
omp.atomic.capture {
// expected-error @below {{invalid sequence of operations in the capture region}}
omp.atomic.write %x = %expr : memref<i32>, i32
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
omp.terminator
}
return
@@ -1241,7 +1241,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>,
%newval = llvm.add %xval, %expr : i32
omp.yield (%newval : i32)
}
- omp.atomic.read %v = %y : memref<i32>, i32
+ omp.atomic.read %v = %y : memref<i32>, memref<i32>, i32
omp.terminator
}
}
@@ -1251,7 +1251,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>,
func.func @omp_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>, %expr: i32) {
omp.atomic.capture {
// expected-error @below {{captured variable in atomic.read must be updated in second operation}}
- omp.atomic.read %v = %y : memref<i32>, i32
+ omp.atomic.read %v = %y : memref<i32>, memref<i32>, i32
omp.atomic.update %x : memref<i32> {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -1266,7 +1266,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>,
func.func @omp_atomic_capture(%x: memref<i32>, %y: memref<i32>, %v: memref<i32>, %expr: i32) {
omp.atomic.capture {
// expected-error @below {{captured variable in atomic.read must be updated in second operation}}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
omp.atomic.write %y = %expr : memref<i32>, i32
omp.terminator
}
@@ -1282,7 +1282,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
return
}
@@ -1297,7 +1297,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
return
}
@@ -1312,7 +1312,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
return
}
@@ -1327,7 +1327,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
return
}
@@ -1342,7 +1342,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
return
}
@@ -1357,7 +1357,7 @@ func.func @omp_atomic_capture(%x: memref<i32>, %v: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x memory_order(seq_cst) : memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(seq_cst) : memref<i32>, memref<i32>, i32
}
return
}
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 6f11b45..b606f9e 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -1282,20 +1282,20 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32,
// CHECK-LABEL: omp_atomic_read
// CHECK-SAME: (%[[v:.*]]: memref<i32>, %[[x:.*]]: memref<i32>)
func.func @omp_atomic_read(%v: memref<i32>, %x: memref<i32>) {
- // CHECK: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
- omp.atomic.read %v = %x : memref<i32>, i32
- // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) : memref<i32>, i32
- omp.atomic.read %v = %x memory_order(seq_cst) : memref<i32>, i32
- // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(acquire) : memref<i32>, i32
- omp.atomic.read %v = %x memory_order(acquire) : memref<i32>, i32
- // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(relaxed) : memref<i32>, i32
- omp.atomic.read %v = %x memory_order(relaxed) : memref<i32>, i32
- // CHECK: omp.atomic.read %[[v]] = %[[x]] hint(contended, nonspeculative) : memref<i32>, i32
- omp.atomic.read %v = %x hint(nonspeculative, contended) : memref<i32>, i32
- // CHECK: omp.atomic.read %[[v]] = %[[x]] hint(contended, speculative) memory_order(seq_cst) : memref<i32>, i32
- omp.atomic.read %v = %x hint(speculative, contended) memory_order(seq_cst) : memref<i32>, i32
- // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) : memref<i32>, i32
- omp.atomic.read %v = %x hint(none) memory_order(seq_cst) : memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(seq_cst) : memref<i32>, memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(acquire) : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(acquire) : memref<i32>, memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(relaxed) : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x memory_order(relaxed) : memref<i32>, memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] hint(contended, nonspeculative) : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x hint(nonspeculative, contended) : memref<i32>, memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] hint(contended, speculative) memory_order(seq_cst) : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x hint(speculative, contended) memory_order(seq_cst) : memref<i32>, memref<i32>, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] memory_order(seq_cst) : memref<i32>, memref<i32>, i32
+ omp.atomic.read %v = %x hint(none) memory_order(seq_cst) : memref<i32>, memref<i32>, i32
return
}
@@ -1531,7 +1531,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture{
omp.atomic.update %x : memref<i32> {
@@ -1539,10 +1539,10 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture {
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: omp.atomic.update %[[x]] : memref<i32>
// CHECK-NEXT: (%[[xval:.*]]: i32):
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
@@ -1550,7 +1550,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: }
// CHECK-NEXT: }
omp.atomic.capture{
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
omp.atomic.update %x : memref<i32> {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -1558,11 +1558,11 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
}
}
// CHECK: omp.atomic.capture {
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: omp.atomic.write %[[x]] = %[[expr]] : memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture{
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
omp.atomic.write %x = %expr : memref<i32>, i32
}
@@ -1572,7 +1572,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(none) {
omp.atomic.update %x : memref<i32> {
@@ -1580,7 +1580,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(uncontended) {
@@ -1589,7 +1589,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(uncontended) {
omp.atomic.update %x : memref<i32> {
@@ -1597,7 +1597,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(contended) {
@@ -1606,7 +1606,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(contended) {
omp.atomic.update %x : memref<i32> {
@@ -1614,7 +1614,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(nonspeculative) {
@@ -1623,7 +1623,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(nonspeculative) {
omp.atomic.update %x : memref<i32> {
@@ -1631,7 +1631,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(speculative) {
@@ -1640,7 +1640,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(speculative) {
omp.atomic.update %x : memref<i32> {
@@ -1648,7 +1648,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(uncontended, nonspeculative) {
@@ -1657,7 +1657,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(uncontended, nonspeculative) {
omp.atomic.update %x : memref<i32> {
@@ -1665,7 +1665,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(contended, nonspeculative) {
@@ -1674,7 +1674,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(contended, nonspeculative) {
omp.atomic.update %x : memref<i32> {
@@ -1682,7 +1682,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(uncontended, speculative) {
@@ -1691,7 +1691,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(uncontended, speculative) {
omp.atomic.update %x : memref<i32> {
@@ -1699,7 +1699,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(contended, speculative) {
@@ -1708,7 +1708,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(contended, speculative) {
omp.atomic.update %x : memref<i32> {
@@ -1716,7 +1716,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture memory_order(seq_cst) {
@@ -1725,7 +1725,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture memory_order(seq_cst) {
omp.atomic.update %x : memref<i32> {
@@ -1733,7 +1733,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture memory_order(acq_rel) {
@@ -1742,7 +1742,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture memory_order(acq_rel) {
omp.atomic.update %x : memref<i32> {
@@ -1750,7 +1750,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture memory_order(acquire) {
@@ -1759,7 +1759,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture memory_order(acquire) {
omp.atomic.update %x : memref<i32> {
@@ -1767,7 +1767,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture memory_order(release) {
@@ -1776,7 +1776,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture memory_order(release) {
omp.atomic.update %x : memref<i32> {
@@ -1784,7 +1784,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture memory_order(relaxed) {
@@ -1793,7 +1793,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture memory_order(relaxed) {
omp.atomic.update %x : memref<i32> {
@@ -1801,7 +1801,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
// CHECK: omp.atomic.capture hint(contended, speculative) memory_order(seq_cst) {
@@ -1810,7 +1810,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
// CHECK-NEXT: %[[newval:.*]] = llvm.add %[[xval]], %[[expr]] : i32
// CHECK-NEXT: omp.yield(%[[newval]] : i32)
// CHECK-NEXT: }
- // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, i32
+ // CHECK-NEXT: omp.atomic.read %[[v]] = %[[x]] : memref<i32>, memref<i32>, i32
// CHECK-NEXT: }
omp.atomic.capture hint(contended, speculative) memory_order(seq_cst) {
omp.atomic.update %x : memref<i32> {
@@ -1818,7 +1818,7 @@ func.func @omp_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : memref<i32>, i32
+ omp.atomic.read %v = %x : memref<i32>, memref<i32>, i32
}
return
@@ -2420,14 +2420,14 @@ func.func @omp_requires_multiple() -> ()
// CHECK-LABEL: @opaque_pointers_atomic_rwu
// CHECK-SAME: (%[[v:.*]]: !llvm.ptr, %[[x:.*]]: !llvm.ptr)
func.func @opaque_pointers_atomic_rwu(%v: !llvm.ptr, %x: !llvm.ptr) {
- // CHECK: omp.atomic.read %[[v]] = %[[x]] : !llvm.ptr, i32
+ // CHECK: omp.atomic.read %[[v]] = %[[x]] : !llvm.ptr, !llvm.ptr, i32
// CHECK: %[[VAL:.*]] = llvm.load %[[x]] : !llvm.ptr -> i32
// CHECK: omp.atomic.write %[[v]] = %[[VAL]] : !llvm.ptr, i32
// CHECK: omp.atomic.update %[[x]] : !llvm.ptr {
// CHECK-NEXT: ^{{[[:alnum:]]+}}(%[[XVAL:.*]]: i32):
// CHECK-NEXT: omp.yield(%[[XVAL]] : i32)
// CHECK-NEXT: }
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
%val = llvm.load %x : !llvm.ptr -> i32
omp.atomic.write %v = %val : !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 3bcf580..67cd01f 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -217,6 +217,17 @@ func.func @pad_noop(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
// -----
+// CHECK-LABEL: @pad_noop_padding_mismatch_nofold
+func.func @pad_noop_padding_mismatch_nofold(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+ // CHECK: %[[PAD:.+]] = tosa.pad
+ // CHECK: return %[[PAD]]
+ %0 = "tosa.const"() { value = dense_resource<__elided__> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
+ %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+ return %1 : tensor<?x?xf32>
+}
+
+// -----
+
// CHECK-LABEL: @pad_noop_type_mismatch_nofold
func.func @pad_noop_type_mismatch_nofold(%arg0: tensor<10xf32>) -> tensor<?xf32> {
// CHECK: %[[PAD:.+]] = tosa.pad
diff --git a/mlir/test/Dialect/Vector/int-range-interface.mlir b/mlir/test/Dialect/Vector/int-range-interface.mlir
index 2928242..09dfe93 100644
--- a/mlir/test/Dialect/Vector/int-range-interface.mlir
+++ b/mlir/test/Dialect/Vector/int-range-interface.mlir
@@ -96,7 +96,7 @@ func.func @vector_insertelement() -> vector<4xindex> {
// CHECK-LABEL: func @test_loaded_vector_extract
// No bounds
-// CHECK: test.reflect_bounds %{{.*}} : i32
+// CHECK: test.reflect_bounds {smax = 2147483647 : si32, smin = -2147483648 : si32, umax = 4294967295 : ui32, umin = 0 : ui32} %{{.*}} : i32
func.func @test_loaded_vector_extract(%memref : memref<16xi32>) -> i32 {
%c0 = arith.constant 0 : index
%v = vector.load %memref[%c0] : memref<16xi32>, vector<4xi32>
@@ -104,3 +104,12 @@ func.func @test_loaded_vector_extract(%memref : memref<16xi32>) -> i32 {
%bounds = test.reflect_bounds %e : i32
func.return %bounds : i32
}
+
+// CHECK-LABEL: func @test_vector_extsi
+// CHECK: test.reflect_bounds {smax = 5 : si32, smin = 1 : si32, umax = 5 : ui32, umin = 1 : ui32}
+func.func @test_vector_extsi() -> vector<2xi32> {
+ %0 = test.with_bounds {smax = 5 : si8, smin = 1 : si8, umax = 5 : ui8, umin = 1 : ui8 } : vector<2xi8>
+ %1 = arith.extsi %0 : vector<2xi8> to vector<2xi32>
+ %2 = test.reflect_bounds %1 : vector<2xi32>
+ func.return %2 : vector<2xi32>
+}
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
index 7ecbad7..0cecadd 100644
--- a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
+++ b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir
@@ -1,59 +1,65 @@
// RUN: mlir-opt --test-emulate-narrow-int="arith-compute-bitwidth=1 memref-load-bitwidth=8" --cse --split-input-file %s | FileCheck %s
-func.func @vector_load_i2(%arg1: index, %arg2: index) -> vector<3x3xi2> {
- %0 = memref.alloc() : memref<3x3xi2>
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant dense<0> : vector<3x3xi2>
- %1 = vector.load %0[%c2, %c0] : memref<3x3xi2>, vector<3xi2>
- %2 = vector.insert %1, %cst [0] : vector<3xi2> into vector<3x3xi2>
- return %2 : vector<3x3xi2>
+// TODO: remove memref.alloc() in the tests to eliminate noise.
+// memref.alloc exists here because sub-byte vector data types such as i2
+// are currently not supported as input arguments.
+
+
+func.func @vector_load_i2() -> vector<3x3xi2> {
+ %0 = memref.alloc() : memref<3x3xi2>
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant dense<0> : vector<3x3xi2>
+ %1 = vector.load %0[%c2, %c0] : memref<3x3xi2>, vector<3xi2>
+ %2 = vector.insert %1, %cst [0] : vector<3xi2> into vector<3x3xi2>
+ return %2 : vector<3x3xi2>
}
-// CHECK: func @vector_load_i2
+// CHECK-LABEL: func @vector_load_i2
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
// CHECK: %[[INDEX:.+]] = arith.constant 1 : index
// CHECK: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi8>, vector<2xi8>
// CHECK: %[[VEC_I2:.+]] = vector.bitcast %[[VEC]] : vector<2xi8> to vector<8xi2>
// CHECK: %[[EXCTRACT:.+]] = vector.extract_strided_slice %[[VEC_I2]] {offsets = [2], sizes = [3], strides = [1]} : vector<8xi2> to vector<3xi2>
-//-----
+// -----
func.func @vector_transfer_read_i2() -> vector<3xi2> {
- %0 = memref.alloc() : memref<3x3xi2>
- %c0i2 = arith.constant 0 : i2
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %1 = vector.transfer_read %0[%c2, %c0], %c0i2 {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
- return %1 : vector<3xi2>
+ %0 = memref.alloc() : memref<3x3xi2>
+ %pad = arith.constant 0 : i2
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %1 = vector.transfer_read %0[%c2, %c0], %pad {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
+ return %1 : vector<3xi2>
}
-// CHECK: func @vector_transfer_read_i2
+// CHECK-LABEL: func @vector_transfer_read_i2
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
// CHECK: %[[INDEX:.+]] = arith.constant 1 : index
// CHECK: %[[READ:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %0 : memref<3xi8>, vector<2xi8>
// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[READ]] : vector<2xi8> to vector<8xi2>
// CHECK: vector.extract_strided_slice %[[BITCAST]] {offsets = [2], sizes = [3], strides = [1]} : vector<8xi2> to vector<3xi2>
-//-----
+// -----
func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
- %0 = memref.alloc() : memref<3x5xi2>
- %cst = arith.constant dense<0> : vector<3x5xi2>
- %mask = vector.constant_mask [3] : vector<5xi1>
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %1 = vector.maskedload %0[%c2, %c0], %mask, %passthru :
- memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
- %2 = vector.insert %1, %cst [0] : vector<5xi2> into vector<3x5xi2>
- return %2 : vector<3x5xi2>
+ %0 = memref.alloc() : memref<3x5xi2>
+ %cst = arith.constant dense<0> : vector<3x5xi2>
+ %mask = vector.constant_mask [3] : vector<5xi1>
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %1 = vector.maskedload %0[%c2, %c0], %mask, %passthru :
+ memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
+ %2 = vector.insert %1, %cst [0] : vector<5xi2> into vector<3x5xi2>
+ return %2 : vector<3x5xi2>
}
-// CHECK: func @vector_cst_maskedload_i2
+// CHECK-LABEL: func @vector_cst_maskedload_i2(
+// CHECK-SAME: %[[ARG0:.+]]: vector<5xi2>) -> vector<3x5xi2>
// CHECK: %[[ORIGINMASK:.+]] = vector.constant_mask [3] : vector<5xi1>
// CHECK: %[[NEWMASK:.+]] = arith.constant dense<true> : vector<2xi1>
// CHECK: %[[VESSEL:.+]] = arith.constant dense<0> : vector<8xi2>
-// CHECK: %[[INSERT1:.+]] = vector.insert_strided_slice %arg0, %[[VESSEL]]
+// CHECK: %[[INSERT1:.+]] = vector.insert_strided_slice %[[ARG0]], %[[VESSEL]]
// CHECK-SAME: {offsets = [2], strides = [1]} : vector<5xi2> into vector<8xi2>
// CHECK: %[[BITCAST1:.+]] = vector.bitcast %[[INSERT1]] : vector<8xi2> to vector<2xi8>
// CHECK: %[[C2:.+]] = arith.constant 2 : index
@@ -64,4 +70,116 @@ func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[ORIGINMASK]], %[[CST2]]
// CHECK-SAME: {offsets = [2], strides = [1]} : vector<5xi1> into vector<8xi1>
// CHECK: %[[SELECT:.+]] = arith.select %[[INSERT2]], %[[BITCAST2]], %[[INSERT1]] : vector<8xi1>, vector<8xi2>
-// CHECK: vector.extract_strided_slice %[[SELECT]] {offsets = [2], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2>
+// CHECK: vector.extract_strided_slice %[[SELECT]] {offsets = [2], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2>
+
+// -----
+
+func.func @vector_load_i2_dynamic_indexing(%idx1: index, %idx2: index) -> vector<3xi2> {
+ %0 = memref.alloc() : memref<3x3xi2>
+ %cst = arith.constant dense<0> : vector<3x3xi2>
+ %1 = vector.load %0[%idx1, %idx2] : memref<3x3xi2>, vector<3xi2>
+ return %1 : vector<3xi2>
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * 3 + s1) floordiv 4)>
+// CHECK: #[[MAP1:.+]] = affine_map<()[s0, s1] -> ((s0 * 3 + s1) mod 4)>
+// CHECK: func @vector_load_i2_dynamic_indexing(
+// CHECK-SAME: %[[ARG0:.+]]: index, %[[ARG1:.+]]: index) -> vector<3xi2>
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[LOADADDR1:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK: %[[LOADADDR2:.+]] = affine.apply #[[MAP1]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK: %[[EMULATED_LOAD:.+]] = vector.load %[[ALLOC]][%[[LOADADDR1]]] : memref<3xi8>, vector<2xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[EMULATED_LOAD]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[ZERO:.+]] = arith.constant dense<0> : vector<3xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BITCAST]][%[[LOADADDR2]]] : i2 from vector<8xi2>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[OFFSET:.+]] = arith.addi %[[LOADADDR2]], %[[C1]] : index
+// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST]][%[[OFFSET]]] : i2 from vector<8xi2>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[OFFSET2:.+]] = arith.addi %[[LOADADDR2]], %[[C2]] : index
+// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[BITCAST]][%[[OFFSET2]]] : i2 from vector<8xi2>
+
+// -----
+
+func.func @vector_load_i2_dynamic_indexing_mixed(%idx: index) -> vector<3xi2> {
+ %0 = memref.alloc() : memref<3x3xi2>
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant dense<1> : vector<3x3xi2>
+ %1 = vector.load %0[%idx, %c2] : memref<3x3xi2>, vector<3xi2>
+ return %1 : vector<3xi2>
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<()[s0] -> ((s0 * 3 + 2) floordiv 4)>
+// CHECK: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 3 - ((s0 * 3 + 2) floordiv 4) * 4 + 2)>
+// CHECK: func @vector_load_i2_dynamic_indexing_mixed(
+// CHECK-SAME: %[[ARG0:.+]]: index) -> vector<3xi2>
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[LOADADDR1:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+// CHECK: %[[LOADADDR2:.+]] = affine.apply #[[MAP1]]()[%[[ARG0]]]
+// CHECK: %[[EMULATED_LOAD:.+]] = vector.load %[[ALLOC]][%[[LOADADDR1]]] : memref<3xi8>, vector<2xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[EMULATED_LOAD]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[ZERO:.+]] = arith.constant dense<0> : vector<3xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BITCAST]][%[[LOADADDR2]]] : i2 from vector<8xi2>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[OFFSET:.+]] = arith.addi %[[LOADADDR2]], %[[C1]] : index
+// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST]][%[[OFFSET]]] : i2 from vector<8xi2>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[OFFSET2:.+]] = arith.addi %[[LOADADDR2]], %[[C2]] : index
+// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[BITCAST]][%[[OFFSET2]]] : i2 from vector<8xi2>
+
+// -----
+
+func.func @vector_transfer_read_i2_dynamic_indexing(%idx1: index, %idx2: index) -> vector<3xi2> {
+ %0 = memref.alloc() : memref<3x3xi2>
+ %pad = arith.constant 0 : i2
+ %1 = vector.transfer_read %0[%idx1, %idx2], %pad {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
+ return %1 : vector<3xi2>
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * 3 + s1) floordiv 4)>
+// CHECK: #[[MAP1:.+]] = affine_map<()[s0, s1] -> ((s0 * 3 + s1) mod 4)>
+// CHECK: func @vector_transfer_read_i2_dynamic_indexing(
+// CHECK-SAME: %[[ARG0:.+]]: index, %[[ARG1:.+]]: index) -> vector<3xi2>
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[C0:.+]] = arith.extui %c0_i2 : i2 to i8
+// CHECK: %[[LOADADDR1:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK: %[[LOADADDR2:.+]] = affine.apply #[[MAP1]]()[%[[ARG0]], %[[ARG1]]]
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[ALLOC]][%[[LOADADDR1]]], %[[C0]] : memref<3xi8>, vector<2xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[READ]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[CST:.+]] = arith.constant dense<0> : vector<3xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BITCAST]][%[[LOADADDR2]]] : i2 from vector<8xi2>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[ADDI:.+]] = arith.addi %[[LOADADDR2]], %[[C1]] : index
+// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST]][%[[ADDI]]] : i2 from vector<8xi2>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[ADDI2:.+]] = arith.addi %[[LOADADDR2]], %[[C2]] : index
+// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[BITCAST]][%[[ADDI2]]] : i2 from vector<8xi2>
+
+// -----
+
+func.func @vector_transfer_read_i2_dynamic_indexing_mixed(%idx1: index) -> vector<3xi2> {
+ %0 = memref.alloc() : memref<3x3xi2>
+ %c2 = arith.constant 2 : index
+ %pad = arith.constant 0 : i2
+ %1 = vector.transfer_read %0[%idx1, %c2], %pad {in_bounds = [true]} : memref<3x3xi2>, vector<3xi2>
+ return %1 : vector<3xi2>
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<()[s0] -> ((s0 * 3 + 2) floordiv 4)>
+// CHECK: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 3 - ((s0 * 3 + 2) floordiv 4) * 4 + 2)>
+// CHECK: func @vector_transfer_read_i2_dynamic_indexing_mixed(
+// CHECK-SAME: %[[ARG0:.+]]: index) -> vector<3xi2>
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
+// CHECK: %[[C0:.+]] = arith.extui %c0_i2 : i2 to i8
+// CHECK: %[[LOADADDR1:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+// CHECK: %[[LOADADDR2:.+]] = affine.apply #[[MAP1]]()[%[[ARG0]]]
+// CHECK: %[[READ:.+]] = vector.transfer_read %[[ALLOC]][%[[LOADADDR1]]], %[[C0]] : memref<3xi8>, vector<2xi8>
+// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[READ]] : vector<2xi8> to vector<8xi2>
+// CHECK: %[[CST:.+]] = arith.constant dense<0> : vector<3xi2>
+// CHECK: %[[EXTRACT:.+]] = vector.extract %[[BITCAST]][%[[LOADADDR2]]] : i2 from vector<8xi2>
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[ADDI:.+]] = arith.addi %[[LOADADDR2]], %[[C1]] : index
+// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[BITCAST]][%[[ADDI]]] : i2 from vector<8xi2>
+// CHECK: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: %[[ADDI2:.+]] = arith.addi %[[LOADADDR2]], %[[C2]] : index
+// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[BITCAST]][%[[ADDI2]]] : i2 from vector<8xi2>
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm-invalid.mlir b/mlir/test/Target/LLVMIR/openmp-llvm-invalid.mlir
index 3c6ca1e..41bc5c4 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm-invalid.mlir
@@ -35,7 +35,7 @@ llvm.func @omp_atomic_update_multiple_step_update(%x: !llvm.ptr, %expr: i32) {
llvm.func @omp_atomic_update_multiple_step_update(%x: !llvm.ptr, %v: !llvm.ptr, %expr: i32) {
// expected-error @+1 {{LLVM Translation failed for operation: omp.atomic.capture}}
omp.atomic.capture memory_order(seq_cst) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
// expected-error @+1 {{no atomic update operation with region argument as operand found inside atomic.update region}}
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
@@ -52,7 +52,7 @@ llvm.func @omp_atomic_update_multiple_step_update(%x: !llvm.ptr, %v: !llvm.ptr,
// update operation
llvm.func @omp_atomic_update_multiple_step_update(%x: !llvm.ptr, %v: !llvm.ptr, %expr: i32) {
omp.atomic.capture memory_order(seq_cst) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%t1 = llvm.mul %xval, %expr : i32
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 49f9f35..e68102e 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -1348,21 +1348,21 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () {
// CHECK: %[[X1:.*]] = load atomic i32, ptr %[[ARG0]] monotonic, align 4
// CHECK: store i32 %[[X1]], ptr %[[ARG1]], align 4
- omp.atomic.read %arg1 = %arg0 : !llvm.ptr, i32
+ omp.atomic.read %arg1 = %arg0 : !llvm.ptr, !llvm.ptr, i32
// CHECK: %[[X2:.*]] = load atomic i32, ptr %[[ARG0]] seq_cst, align 4
// CHECK: call void @__kmpc_flush(ptr @{{.*}})
// CHECK: store i32 %[[X2]], ptr %[[ARG1]], align 4
- omp.atomic.read %arg1 = %arg0 memory_order(seq_cst) : !llvm.ptr, i32
+ omp.atomic.read %arg1 = %arg0 memory_order(seq_cst) : !llvm.ptr, !llvm.ptr, i32
// CHECK: %[[X3:.*]] = load atomic i32, ptr %[[ARG0]] acquire, align 4
// CHECK: call void @__kmpc_flush(ptr @{{.*}})
// CHECK: store i32 %[[X3]], ptr %[[ARG1]], align 4
- omp.atomic.read %arg1 = %arg0 memory_order(acquire) : !llvm.ptr, i32
+ omp.atomic.read %arg1 = %arg0 memory_order(acquire) : !llvm.ptr, !llvm.ptr, i32
// CHECK: %[[X4:.*]] = load atomic i32, ptr %[[ARG0]] monotonic, align 4
// CHECK: store i32 %[[X4]], ptr %[[ARG1]], align 4
- omp.atomic.read %arg1 = %arg0 memory_order(relaxed) : !llvm.ptr, i32
+ omp.atomic.read %arg1 = %arg0 memory_order(relaxed) : !llvm.ptr, !llvm.ptr, i32
llvm.return
}
@@ -1518,7 +1518,7 @@ llvm.func @_QPomp_atomic_capture_complex() {
%20 = llvm.insertvalue %17, %19[1] : !llvm.struct<(f32, f32)>
omp.yield(%20 : !llvm.struct<(f32, f32)>)
}
- omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.struct<(f32, f32)>
+ omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)>
}
llvm.return
}
@@ -1541,7 +1541,7 @@ llvm.func @omp_atomic_read_complex(){
%1 = llvm.alloca %0 x !llvm.struct<(f32, f32)> {bindc_name = "ib"} : (i64) -> !llvm.ptr
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.alloca %2 x !llvm.struct<(f32, f32)> {bindc_name = "ia"} : (i64) -> !llvm.ptr
- omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.struct<(f32, f32)>
+ omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)>
llvm.return
}
@@ -1646,7 +1646,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.add %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[res:.*]] = atomicrmw sub ptr %[[x]], i32 %[[expr]] monotonic
@@ -1658,7 +1658,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.sub %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[res:.*]] = atomicrmw and ptr %[[x]], i32 %[[expr]] monotonic
@@ -1670,7 +1670,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.and %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[res:.*]] = atomicrmw or ptr %[[x]], i32 %[[expr]] monotonic
@@ -1682,7 +1682,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.or %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[res:.*]] = atomicrmw xor ptr %[[x]], i32 %[[expr]] monotonic
@@ -1694,7 +1694,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.xor %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1709,7 +1709,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.mul %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1724,7 +1724,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.sdiv %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1739,7 +1739,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.udiv %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1754,7 +1754,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.shl %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1769,7 +1769,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.lshr %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1784,7 +1784,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.ashr %xval, %expr : i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1799,7 +1799,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = "llvm.intr.smax"(%xval, %expr) : (i32, i32) -> i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1814,7 +1814,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = "llvm.intr.smin"(%xval, %expr) : (i32, i32) -> i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1829,7 +1829,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = "llvm.intr.umax"(%xval, %expr) : (i32, i32) -> i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1844,7 +1844,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = "llvm.intr.umin"(%xval, %expr) : (i32, i32) -> i32
omp.yield(%newval : i32)
}
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1859,7 +1859,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.fadd %xval, %exprf : f32
omp.yield(%newval : f32)
}
- omp.atomic.read %vf = %xf : !llvm.ptr, f32
+ omp.atomic.read %vf = %xf : !llvm.ptr, !llvm.ptr, f32
}
// CHECK: %[[xval:.*]] = phi i32
@@ -1874,7 +1874,7 @@ llvm.func @omp_atomic_capture_prefix_update(
%newval = llvm.fsub %xval, %exprf : f32
omp.yield(%newval : f32)
}
- omp.atomic.read %vf = %xf : !llvm.ptr, f32
+ omp.atomic.read %vf = %xf : !llvm.ptr, !llvm.ptr, f32
}
llvm.return
@@ -1890,7 +1890,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %[[res:.*]] = atomicrmw add ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -1901,7 +1901,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %[[res:.*]] = atomicrmw sub ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.sub %xval, %expr : i32
@@ -1912,7 +1912,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %[[res:.*]] = atomicrmw and ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.and %xval, %expr : i32
@@ -1923,7 +1923,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %[[res:.*]] = atomicrmw or ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.or %xval, %expr : i32
@@ -1934,7 +1934,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %[[res:.*]] = atomicrmw xor ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.xor %xval, %expr : i32
@@ -1949,7 +1949,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.mul %xval, %expr : i32
@@ -1964,7 +1964,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.sdiv %xval, %expr : i32
@@ -1979,7 +1979,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.udiv %xval, %expr : i32
@@ -1994,7 +1994,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.shl %xval, %expr : i32
@@ -2009,7 +2009,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.lshr %xval, %expr : i32
@@ -2024,7 +2024,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.ashr %xval, %expr : i32
@@ -2039,7 +2039,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = "llvm.intr.smax"(%xval, %expr) : (i32, i32) -> i32
@@ -2054,7 +2054,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = "llvm.intr.smin"(%xval, %expr) : (i32, i32) -> i32
@@ -2069,7 +2069,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = "llvm.intr.umax"(%xval, %expr) : (i32, i32) -> i32
@@ -2084,7 +2084,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK-NEXT: %{{.*}} = cmpxchg ptr %[[x]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = "llvm.intr.umin"(%xval, %expr) : (i32, i32) -> i32
@@ -2100,7 +2100,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %{{.*}} = cmpxchg ptr %[[xf]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store float %[[xvalf]], ptr %[[vf]]
omp.atomic.capture {
- omp.atomic.read %vf = %xf : !llvm.ptr, f32
+ omp.atomic.read %vf = %xf : !llvm.ptr, !llvm.ptr, f32
omp.atomic.update %xf : !llvm.ptr {
^bb0(%xval: f32):
%newval = llvm.fadd %xval, %exprf : f32
@@ -2116,7 +2116,7 @@ llvm.func @omp_atomic_capture_postfix_update(
// CHECK: %{{.*}} = cmpxchg ptr %[[xf]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store float %[[xvalf]], ptr %[[vf]]
omp.atomic.capture {
- omp.atomic.read %vf = %xf : !llvm.ptr, f32
+ omp.atomic.read %vf = %xf : !llvm.ptr, !llvm.ptr, f32
omp.atomic.update %xf : !llvm.ptr {
^bb0(%xval: f32):
%newval = llvm.fsub %xval, %exprf : f32
@@ -2136,7 +2136,7 @@ llvm.func @omp_atomic_capture_misc(
// CHECK: %[[xval:.*]] = atomicrmw xchg ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[xval]], ptr %[[v]]
omp.atomic.capture{
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.write %x = %expr : !llvm.ptr, i32
}
@@ -2147,14 +2147,14 @@ llvm.func @omp_atomic_capture_misc(
// CHECK: %{{.*}} = cmpxchg ptr %[[xf]], i32 %[[xval]], i32 %[[newval_]] monotonic monotonic
// CHECK: store float %[[xvalf]], ptr %[[vf]]
omp.atomic.capture{
- omp.atomic.read %vf = %xf : !llvm.ptr, f32
+ omp.atomic.read %vf = %xf : !llvm.ptr, !llvm.ptr, f32
omp.atomic.write %xf = %exprf : !llvm.ptr, f32
}
// CHECK: %[[res:.*]] = atomicrmw add ptr %[[x]], i32 %[[expr]] seq_cst
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture memory_order(seq_cst) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -2165,7 +2165,7 @@ llvm.func @omp_atomic_capture_misc(
// CHECK: %[[res:.*]] = atomicrmw add ptr %[[x]], i32 %[[expr]] acquire
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture memory_order(acquire) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -2176,7 +2176,7 @@ llvm.func @omp_atomic_capture_misc(
// CHECK: %[[res:.*]] = atomicrmw add ptr %[[x]], i32 %[[expr]] release
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture memory_order(release) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -2187,7 +2187,7 @@ llvm.func @omp_atomic_capture_misc(
// CHECK: %[[res:.*]] = atomicrmw add ptr %[[x]], i32 %[[expr]] monotonic
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture memory_order(relaxed) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -2198,7 +2198,7 @@ llvm.func @omp_atomic_capture_misc(
// CHECK: %[[res:.*]] = atomicrmw add ptr %[[x]], i32 %[[expr]] acq_rel
// CHECK: store i32 %[[res]], ptr %[[v]]
omp.atomic.capture memory_order(acq_rel) {
- omp.atomic.read %v = %x : !llvm.ptr, i32
+ omp.atomic.read %v = %x : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %x : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
@@ -2771,12 +2771,12 @@ llvm.func @omp_taskgroup_task(%x: i32, %y: i32, %zaddr: !llvm.ptr) {
llvm.func @omp_opaque_pointers(%arg0 : !llvm.ptr, %arg1: !llvm.ptr, %expr: i32) -> () {
// CHECK: %[[X1:.*]] = load atomic i32, ptr %[[ARG0]] monotonic, align 4
// CHECK: store i32 %[[X1]], ptr %[[ARG1]], align 4
- omp.atomic.read %arg1 = %arg0 : !llvm.ptr, i32
+ omp.atomic.read %arg1 = %arg0 : !llvm.ptr, !llvm.ptr, i32
// CHECK: %[[RES:.*]] = atomicrmw add ptr %[[ARG1]], i32 %[[EXPR]] acq_rel
// CHECK: store i32 %[[RES]], ptr %[[ARG0]]
omp.atomic.capture memory_order(acq_rel) {
- omp.atomic.read %arg0 = %arg1 : !llvm.ptr, i32
+ omp.atomic.read %arg0 = %arg1 : !llvm.ptr, !llvm.ptr, i32
omp.atomic.update %arg1 : !llvm.ptr {
^bb0(%xval: i32):
%newval = llvm.add %xval, %expr : i32
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 3c9bd90..982f7e5 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -4,12 +4,12 @@
llvm.func @atomic_hint(%v : !llvm.ptr, %x : !llvm.ptr, %expr : i32) {
// expected-warning@below {{hint clause discarded}}
omp.atomic.capture hint(uncontended) {
- omp.atomic.read %x = %v : !llvm.ptr, i32
+ omp.atomic.read %x = %v : !llvm.ptr, !llvm.ptr, i32
omp.atomic.write %v = %expr : !llvm.ptr, i32
}
// expected-warning@below {{hint clause discarded}}
- omp.atomic.read %x = %v hint(contended) : !llvm.ptr, i32
+ omp.atomic.read %x = %v hint(contended) : !llvm.ptr, !llvm.ptr, i32
// expected-warning@below {{hint clause discarded}}
omp.atomic.write %v = %expr hint(nonspeculative) : !llvm.ptr, i32
@@ -29,7 +29,7 @@ llvm.func @atomic_hint(%v : !llvm.ptr, %x : !llvm.ptr, %expr : i32) {
llvm.func @cancel() {
// expected-error@below {{LLVM Translation failed for operation: omp.parallel}}
omp.parallel {
- // expected-error@below {{unsupported OpenMP operation: omp.cancel}}
+ // expected-error@below {{not yet implemented: omp.cancel}}
// expected-error@below {{LLVM Translation failed for operation: omp.cancel}}
omp.cancel cancellation_construct_type(parallel)
omp.terminator
@@ -42,7 +42,7 @@ llvm.func @cancel() {
llvm.func @cancellation_point() {
// expected-error@below {{LLVM Translation failed for operation: omp.parallel}}
omp.parallel {
- // expected-error@below {{unsupported OpenMP operation: omp.cancellation_point}}
+ // expected-error@below {{not yet implemented: omp.cancellation_point}}
// expected-error@below {{LLVM Translation failed for operation: omp.cancellation_point}}
omp.cancellation_point cancellation_construct_type(parallel)
omp.terminator
@@ -67,7 +67,7 @@ llvm.func @do_simd(%lb : i32, %ub : i32, %step : i32) {
// -----
llvm.func @distribute(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{unsupported OpenMP operation: omp.distribute}}
+ // expected-error@below {{not yet implemented: omp.distribute}}
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
omp.distribute {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -80,7 +80,7 @@ llvm.func @distribute(%lb : i32, %ub : i32, %step : i32) {
// -----
llvm.func @ordered_region_par_level_simd() {
- // expected-error@below {{parallelization-level clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause parallelization-level in omp.ordered.region operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.ordered.region}}
omp.ordered.region par_level_simd {
omp.terminator
@@ -91,7 +91,7 @@ llvm.func @ordered_region_par_level_simd() {
// -----
llvm.func @parallel_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.parallel operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.parallel}}
omp.parallel allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -102,7 +102,7 @@ llvm.func @parallel_allocate(%x : !llvm.ptr) {
// -----
llvm.func @sections_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.sections operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.sections}}
omp.sections allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -119,7 +119,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield(%1 : !llvm.ptr)
}
llvm.func @sections_private(%x : !llvm.ptr) {
- // expected-error@below {{privatization clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.sections operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.sections}}
omp.sections private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.terminator
@@ -130,7 +130,7 @@ llvm.func @sections_private(%x : !llvm.ptr) {
// -----
llvm.func @simd_aligned(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{aligned clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause aligned in omp.simd operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.simd}}
omp.simd aligned(%x : !llvm.ptr -> 32) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -143,7 +143,7 @@ llvm.func @simd_aligned(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// -----
llvm.func @simd_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{linear clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause linear in omp.simd operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.simd}}
omp.simd linear(%x = %step : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -156,7 +156,7 @@ llvm.func @simd_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// -----
llvm.func @simd_nontemporal(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{nontemporal clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause nontemporal in omp.simd operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.simd}}
omp.simd nontemporal(%x : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -175,7 +175,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield(%1 : !llvm.ptr)
}
llvm.func @simd_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{privatization clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.simd operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.simd}}
omp.simd private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -205,7 +205,7 @@ atomic {
omp.yield
}
llvm.func @simd_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{reduction clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause reduction in omp.simd operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.simd}}
omp.simd reduction(@add_f32 %x -> %prv : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -218,7 +218,7 @@ llvm.func @simd_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// -----
llvm.func @single_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.single operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.single}}
omp.single allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -235,7 +235,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield(%1 : !llvm.ptr)
}
llvm.func @single_private(%x : !llvm.ptr) {
- // expected-error@below {{privatization clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.single operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.single}}
omp.single private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.terminator
@@ -246,7 +246,7 @@ llvm.func @single_private(%x : !llvm.ptr) {
// -----
llvm.func @target_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -257,7 +257,7 @@ llvm.func @target_allocate(%x : !llvm.ptr) {
// -----
llvm.func @target_device(%x : i32) {
- // expected-error@below {{device clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause device in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target device(%x : i32) {
omp.terminator
@@ -268,7 +268,7 @@ llvm.func @target_device(%x : i32) {
// -----
llvm.func @target_has_device_addr(%x : !llvm.ptr) {
- // expected-error@below {{has_device_addr clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause has_device_addr in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target has_device_addr(%x : !llvm.ptr) {
omp.terminator
@@ -279,7 +279,7 @@ llvm.func @target_has_device_addr(%x : !llvm.ptr) {
// -----
llvm.func @target_if(%x : i1) {
- // expected-error@below {{if clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause if in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target if(%x) {
omp.terminator
@@ -307,7 +307,7 @@ atomic {
omp.yield
}
llvm.func @target_in_reduction(%x : !llvm.ptr) {
- // expected-error@below {{in_reduction clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause in_reduction in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target in_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
@@ -318,7 +318,7 @@ llvm.func @target_in_reduction(%x : !llvm.ptr) {
// -----
llvm.func @target_is_device_ptr(%x : !llvm.ptr) {
- // expected-error@below {{is_device_ptr clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause is_device_ptr in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target is_device_ptr(%x : !llvm.ptr) {
omp.terminator
@@ -336,7 +336,7 @@ omp.private {type = firstprivate} @x.privatizer : !llvm.ptr alloc {
omp.yield(%arg0 : !llvm.ptr)
}
llvm.func @target_firstprivate(%x : !llvm.ptr) {
- // expected-error@below {{firstprivate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause firstprivate in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.terminator
@@ -354,7 +354,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield
}
llvm.func @target_struct_privatization(%x : !llvm.ptr) {
- // expected-error@below {{privatization of structures not yet supported}}
+ // expected-error@below {{not yet implemented: privatization of structures in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.terminator
@@ -365,7 +365,7 @@ llvm.func @target_struct_privatization(%x : !llvm.ptr) {
// -----
llvm.func @target_thread_limit(%x : i32) {
- // expected-error@below {{thread_limit clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause thread_limit in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
omp.target thread_limit(%x : i32) {
omp.terminator
@@ -376,7 +376,7 @@ llvm.func @target_thread_limit(%x : i32) {
// -----
llvm.func @target_enter_data_depend(%x: !llvm.ptr) {
- // expected-error@below {{depend clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}}
omp.target_enter_data depend(taskdependin -> %x : !llvm.ptr) {
omp.terminator
@@ -387,7 +387,7 @@ llvm.func @target_enter_data_depend(%x: !llvm.ptr) {
// -----
llvm.func @target_exit_data_depend(%x: !llvm.ptr) {
- // expected-error@below {{depend clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_exit_data operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target_exit_data}}
omp.target_exit_data depend(taskdependin -> %x : !llvm.ptr) {
omp.terminator
@@ -398,7 +398,7 @@ llvm.func @target_exit_data_depend(%x: !llvm.ptr) {
// -----
llvm.func @target_update_depend(%x: !llvm.ptr) {
- // expected-error@below {{depend clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_update operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target_update}}
omp.target_update depend(taskdependin -> %x : !llvm.ptr) {
omp.terminator
@@ -409,7 +409,7 @@ llvm.func @target_update_depend(%x: !llvm.ptr) {
// -----
llvm.func @task_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.task operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
omp.task allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -437,7 +437,7 @@ atomic {
omp.yield
}
llvm.func @task_in_reduction(%x : !llvm.ptr) {
- // expected-error@below {{in_reduction clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause in_reduction in omp.task operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
omp.task in_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
@@ -448,7 +448,7 @@ llvm.func @task_in_reduction(%x : !llvm.ptr) {
// -----
llvm.func @task_mergeable() {
- // expected-error@below {{mergeable clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause mergeable in omp.task operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
omp.task mergeable {
omp.terminator
@@ -459,7 +459,7 @@ llvm.func @task_mergeable() {
// -----
llvm.func @task_priority(%x : i32) {
- // expected-error@below {{priority clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause priority in omp.task operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
omp.task priority(%x : i32) {
omp.terminator
@@ -476,7 +476,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield(%1 : !llvm.ptr)
}
llvm.func @task_private(%x : !llvm.ptr) {
- // expected-error@below {{privatization clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.task operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
omp.task private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.terminator
@@ -487,7 +487,7 @@ llvm.func @task_private(%x : !llvm.ptr) {
// -----
llvm.func @task_untied() {
- // expected-error@below {{untied clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause untied in omp.task operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
omp.task untied {
omp.terminator
@@ -498,7 +498,7 @@ llvm.func @task_untied() {
// -----
llvm.func @taskgroup_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.taskgroup operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskgroup}}
omp.taskgroup allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -526,7 +526,7 @@ atomic {
omp.yield
}
llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
- // expected-error@below {{task_reduction clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause task_reduction in omp.taskgroup operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskgroup}}
omp.taskgroup task_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
@@ -537,7 +537,7 @@ llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
// -----
llvm.func @taskloop(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{unsupported OpenMP operation: omp.taskloop}}
+ // expected-error@below {{not yet implemented: omp.taskloop}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskloop}}
omp.taskloop {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -550,7 +550,7 @@ llvm.func @taskloop(%lb : i32, %ub : i32, %step : i32) {
// -----
llvm.func @taskwait_depend(%x: !llvm.ptr) {
- // expected-error@below {{depend clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause depend in omp.taskwait operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskwait}}
omp.taskwait depend(taskdependin -> %x : !llvm.ptr) {
omp.terminator
@@ -561,7 +561,7 @@ llvm.func @taskwait_depend(%x: !llvm.ptr) {
// -----
llvm.func @taskwait_nowait() {
- // expected-error@below {{nowait clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause nowait in omp.taskwait operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskwait}}
omp.taskwait nowait {
omp.terminator
@@ -572,7 +572,7 @@ llvm.func @taskwait_nowait() {
// -----
llvm.func @teams_allocate(%x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.teams operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.teams}}
omp.teams allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.terminator
@@ -589,7 +589,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield(%1 : !llvm.ptr)
}
llvm.func @teams_private(%x : !llvm.ptr) {
- // expected-error@below {{privatization clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.teams operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.teams}}
omp.teams private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.terminator
@@ -617,7 +617,7 @@ atomic {
omp.yield
}
llvm.func @teams_reduction(%x : !llvm.ptr) {
- // expected-error@below {{reduction clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause reduction in omp.teams operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.teams}}
omp.teams reduction(@add_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
@@ -628,7 +628,7 @@ llvm.func @teams_reduction(%x : !llvm.ptr) {
// -----
llvm.func @wsloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{allocate clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.wsloop operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.wsloop}}
omp.wsloop allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -641,7 +641,7 @@ llvm.func @wsloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// -----
llvm.func @wsloop_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{linear clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause linear in omp.wsloop operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.wsloop}}
omp.wsloop linear(%x = %step : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -654,7 +654,7 @@ llvm.func @wsloop_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// -----
llvm.func @wsloop_order(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{order clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause order in omp.wsloop operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.wsloop}}
omp.wsloop order(concurrent) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
@@ -673,7 +673,7 @@ omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
omp.yield(%1 : !llvm.ptr)
}
llvm.func @wsloop_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{privatization clause not yet supported}}
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.wsloop operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.wsloop}}
omp.wsloop private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
diff --git a/mlir/test/python/ir/array_attributes.py b/mlir/test/python/ir/array_attributes.py
index 256a69a..2bc403a 100644
--- a/mlir/test/python/ir/array_attributes.py
+++ b/mlir/test/python/ir/array_attributes.py
@@ -326,78 +326,6 @@ def testGetDenseElementsF64():
print(np.array(attr))
-### 1 bit/boolean integer arrays
-# CHECK-LABEL: TEST: testGetDenseElementsI1Signless
-@run
-def testGetDenseElementsI1Signless():
- with Context():
- array = np.array([True], dtype=np.bool_)
- attr = DenseElementsAttr.get(array)
- # CHECK: dense<true> : tensor<1xi1>
- print(attr)
- # CHECK{LITERAL}: [ True]
- print(np.array(attr))
-
- array = np.array([[True, False, True], [True, True, False]], dtype=np.bool_)
- attr = DenseElementsAttr.get(array)
- # CHECK{LITERAL}: dense<[[true, false, true], [true, true, false]]> : tensor<2x3xi1>
- print(attr)
- # CHECK{LITERAL}: [[ True False True]
- # CHECK{LITERAL}: [ True True False]]
- print(np.array(attr))
-
- array = np.array(
- [[True, True, False, False], [True, False, True, False]], dtype=np.bool_
- )
- attr = DenseElementsAttr.get(array)
- # CHECK{LITERAL}: dense<[[true, true, false, false], [true, false, true, false]]> : tensor<2x4xi1>
- print(attr)
- # CHECK{LITERAL}: [[ True True False False]
- # CHECK{LITERAL}: [ True False True False]]
- print(np.array(attr))
-
- array = np.array(
- [
- [True, True, False, False],
- [True, False, True, False],
- [False, False, False, False],
- [True, True, True, True],
- [True, False, False, True],
- ],
- dtype=np.bool_,
- )
- attr = DenseElementsAttr.get(array)
- # CHECK{LITERAL}: dense<[[true, true, false, false], [true, false, true, false], [false, false, false, false], [true, true, true, true], [true, false, false, true]]> : tensor<5x4xi1>
- print(attr)
- # CHECK{LITERAL}: [[ True True False False]
- # CHECK{LITERAL}: [ True False True False]
- # CHECK{LITERAL}: [False False False False]
- # CHECK{LITERAL}: [ True True True True]
- # CHECK{LITERAL}: [ True False False True]]
- print(np.array(attr))
-
- array = np.array(
- [
- [True, True, False, False, True, True, False, False, False],
- [False, False, False, True, False, True, True, False, True],
- ],
- dtype=np.bool_,
- )
- attr = DenseElementsAttr.get(array)
- # CHECK{LITERAL}: dense<[[true, true, false, false, true, true, false, false, false], [false, false, false, true, false, true, true, false, true]]> : tensor<2x9xi1>
- print(attr)
- # CHECK{LITERAL}: [[ True True False False True True False False False]
- # CHECK{LITERAL}: [False False False True False True True False True]]
- print(np.array(attr))
-
- array = np.array([], dtype=np.bool_)
- attr = DenseElementsAttr.get(array)
- # CHECK: dense<> : tensor<0xi1>
- print(attr)
- # CHECK{LITERAL}: []
- print(np.array(attr))
-
-
### 16 bit integer arrays
# CHECK-LABEL: TEST: testGetDenseElementsI16Signless
@run
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
index 452f39d..fbdada9 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
@@ -1,4 +1,4 @@
-//===- OpenACCOpsTest.cpp - OpenACC ops extra functiosn Tests -------------===//
+//===- OpenACCOpsTest.cpp - Unit tests for OpenACC ops --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/MLIRContext.h"
@@ -23,7 +24,8 @@ using namespace mlir::acc;
class OpenACCOpsTest : public ::testing::Test {
protected:
OpenACCOpsTest() : b(&context), loc(UnknownLoc::get(&context)) {
- context.loadDialect<acc::OpenACCDialect, arith::ArithDialect>();
+ context.loadDialect<acc::OpenACCDialect, arith::ArithDialect,
+ memref::MemRefDialect>();
}
MLIRContext context;
@@ -436,3 +438,169 @@ TEST_F(OpenACCOpsTest, routineOpTest) {
op->removeBindNameDeviceTypeAttr();
op->removeBindNameAttr();
}
+
+template <typename Op>
+void testShortDataEntryOpBuilders(OpBuilder &b, MLIRContext &context,
+ Location loc, DataClause dataClause) {
+ auto memrefTy = MemRefType::get({}, b.getI32Type());
+ OwningOpRef<memref::AllocaOp> varPtrOp =
+ b.create<memref::AllocaOp>(loc, memrefTy);
+
+ OwningOpRef<Op> op = b.create<Op>(loc, varPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true);
+
+ EXPECT_EQ(op->getVarPtr(), varPtrOp->getResult());
+ EXPECT_EQ(op->getType(), memrefTy);
+ EXPECT_EQ(op->getDataClause(), dataClause);
+ EXPECT_TRUE(op->getImplicit());
+ EXPECT_TRUE(op->getStructured());
+ EXPECT_TRUE(op->getBounds().empty());
+ EXPECT_FALSE(op->getVarPtrPtr());
+
+ OwningOpRef<Op> op2 = b.create<Op>(loc, varPtrOp->getResult(),
+ /*structured=*/false, /*implicit=*/false);
+ EXPECT_FALSE(op2->getImplicit());
+ EXPECT_FALSE(op2->getStructured());
+
+ OwningOpRef<arith::ConstantIndexOp> extent =
+ b.create<arith::ConstantIndexOp>(loc, 1);
+ OwningOpRef<DataBoundsOp> bounds =
+ b.create<DataBoundsOp>(loc, extent->getResult());
+ OwningOpRef<Op> opWithBounds =
+ b.create<Op>(loc, varPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true, bounds->getResult());
+ EXPECT_FALSE(opWithBounds->getBounds().empty());
+ EXPECT_EQ(opWithBounds->getBounds().back(), bounds->getResult());
+
+ OwningOpRef<Op> opWithName =
+ b.create<Op>(loc, varPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true, "varName");
+ EXPECT_EQ(opWithName->getNameAttr().str(), "varName");
+}
+
+TEST_F(OpenACCOpsTest, shortDataEntryOpBuilder) {
+ testShortDataEntryOpBuilders<PrivateOp>(b, context, loc,
+ DataClause::acc_private);
+ testShortDataEntryOpBuilders<FirstprivateOp>(b, context, loc,
+ DataClause::acc_firstprivate);
+ testShortDataEntryOpBuilders<ReductionOp>(b, context, loc,
+ DataClause::acc_reduction);
+ testShortDataEntryOpBuilders<DevicePtrOp>(b, context, loc,
+ DataClause::acc_deviceptr);
+ testShortDataEntryOpBuilders<PresentOp>(b, context, loc,
+ DataClause::acc_present);
+ testShortDataEntryOpBuilders<CopyinOp>(b, context, loc,
+ DataClause::acc_copyin);
+ testShortDataEntryOpBuilders<CreateOp>(b, context, loc,
+ DataClause::acc_create);
+ testShortDataEntryOpBuilders<NoCreateOp>(b, context, loc,
+ DataClause::acc_no_create);
+ testShortDataEntryOpBuilders<AttachOp>(b, context, loc,
+ DataClause::acc_attach);
+ testShortDataEntryOpBuilders<GetDevicePtrOp>(b, context, loc,
+ DataClause::acc_getdeviceptr);
+ testShortDataEntryOpBuilders<UpdateDeviceOp>(b, context, loc,
+ DataClause::acc_update_device);
+ testShortDataEntryOpBuilders<UseDeviceOp>(b, context, loc,
+ DataClause::acc_use_device);
+ testShortDataEntryOpBuilders<DeclareDeviceResidentOp>(
+ b, context, loc, DataClause::acc_declare_device_resident);
+ testShortDataEntryOpBuilders<DeclareLinkOp>(b, context, loc,
+ DataClause::acc_declare_link);
+ testShortDataEntryOpBuilders<CacheOp>(b, context, loc, DataClause::acc_cache);
+}
+
+template <typename Op>
+void testShortDataExitOpBuilders(OpBuilder &b, MLIRContext &context,
+ Location loc, DataClause dataClause) {
+ auto memrefTy = MemRefType::get({}, b.getI32Type());
+ OwningOpRef<memref::AllocaOp> varPtrOp =
+ b.create<memref::AllocaOp>(loc, memrefTy);
+ OwningOpRef<GetDevicePtrOp> accPtrOp = b.create<GetDevicePtrOp>(
+ loc, varPtrOp->getResult(), /*structured=*/true, /*implicit=*/true);
+
+ OwningOpRef<Op> op =
+ b.create<Op>(loc, accPtrOp->getResult(), varPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true);
+
+ EXPECT_EQ(op->getVarPtr(), varPtrOp->getResult());
+ EXPECT_EQ(op->getAccPtr(), accPtrOp->getResult());
+ EXPECT_EQ(op->getDataClause(), dataClause);
+ EXPECT_TRUE(op->getImplicit());
+ EXPECT_TRUE(op->getStructured());
+ EXPECT_TRUE(op->getBounds().empty());
+
+ OwningOpRef<Op> op2 =
+ b.create<Op>(loc, accPtrOp->getResult(), varPtrOp->getResult(),
+ /*structured=*/false, /*implicit=*/false);
+ EXPECT_FALSE(op2->getImplicit());
+ EXPECT_FALSE(op2->getStructured());
+
+ OwningOpRef<arith::ConstantIndexOp> extent =
+ b.create<arith::ConstantIndexOp>(loc, 1);
+ OwningOpRef<DataBoundsOp> bounds =
+ b.create<DataBoundsOp>(loc, extent->getResult());
+ OwningOpRef<Op> opWithBounds =
+ b.create<Op>(loc, accPtrOp->getResult(), varPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true, bounds->getResult());
+ EXPECT_FALSE(opWithBounds->getBounds().empty());
+ EXPECT_EQ(opWithBounds->getBounds().back(), bounds->getResult());
+
+ OwningOpRef<Op> opWithName =
+ b.create<Op>(loc, accPtrOp->getResult(), varPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true, "varName");
+ EXPECT_EQ(opWithName->getNameAttr().str(), "varName");
+}
+
+TEST_F(OpenACCOpsTest, shortDataExitOpBuilder) {
+ testShortDataExitOpBuilders<CopyoutOp>(b, context, loc,
+ DataClause::acc_copyout);
+ testShortDataExitOpBuilders<UpdateHostOp>(b, context, loc,
+ DataClause::acc_update_host);
+}
+
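+// Variant for exit ops (delete/detach) whose short builders take only the
+// accPtr and carry no varPtr operand.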
+template <typename Op>
+void testShortDataExitNoVarPtrOpBuilders(OpBuilder &b, MLIRContext &context,
+ Location loc, DataClause dataClause) {
+ auto memrefTy = MemRefType::get({}, b.getI32Type());
+ OwningOpRef<memref::AllocaOp> varPtrOp =
+ b.create<memref::AllocaOp>(loc, memrefTy);
+ OwningOpRef<GetDevicePtrOp> accPtrOp = b.create<GetDevicePtrOp>(
+ loc, varPtrOp->getResult(), /*structured=*/true, /*implicit=*/true);
+
+ OwningOpRef<Op> op = b.create<Op>(loc, accPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true);
+
+ EXPECT_EQ(op->getAccPtr(), accPtrOp->getResult());
+ EXPECT_EQ(op->getDataClause(), dataClause);
+ EXPECT_TRUE(op->getImplicit());
+ EXPECT_TRUE(op->getStructured());
+ EXPECT_TRUE(op->getBounds().empty());
+
+ OwningOpRef<Op> op2 = b.create<Op>(loc, accPtrOp->getResult(),
+ /*structured=*/false, /*implicit=*/false);
+ EXPECT_FALSE(op2->getImplicit());
+ EXPECT_FALSE(op2->getStructured());
+
+ OwningOpRef<arith::ConstantIndexOp> extent =
+ b.create<arith::ConstantIndexOp>(loc, 1);
+ OwningOpRef<DataBoundsOp> bounds =
+ b.create<DataBoundsOp>(loc, extent->getResult());
+ OwningOpRef<Op> opWithBounds =
+ b.create<Op>(loc, accPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true, bounds->getResult());
+ EXPECT_FALSE(opWithBounds->getBounds().empty());
+ EXPECT_EQ(opWithBounds->getBounds().back(), bounds->getResult());
+
+ OwningOpRef<Op> opWithName =
+ b.create<Op>(loc, accPtrOp->getResult(),
+ /*structured=*/true, /*implicit=*/true, "varName");
+ EXPECT_EQ(opWithName->getNameAttr().str(), "varName");
+}
+
+TEST_F(OpenACCOpsTest, shortDataExitOpNoVarPtrBuilder) {
+ testShortDataExitNoVarPtrOpBuilders<DeleteOp>(b, context, loc,
+ DataClause::acc_delete);
+ testShortDataExitNoVarPtrOpBuilders<DetachOp>(b, context, loc,
+ DataClause::acc_detach);
+}
diff --git a/mlir/unittests/IR/AffineMapTest.cpp b/mlir/unittests/IR/AffineMapTest.cpp
index 081afad..eaeb18d 100644
--- a/mlir/unittests/IR/AffineMapTest.cpp
+++ b/mlir/unittests/IR/AffineMapTest.cpp
@@ -21,3 +21,58 @@ TEST(AffineMapTest, inferMapFromAffineExprs) {
map.replace(replacements);
EXPECT_EQ(map, map);
}
+
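+// A map is treated as a projected permutation when every result is a distinct
+// dim expression (a constant-0 result is accepted only when
+// allowZeroInResults is set); the cases below cover both accepting and
+// rejecting inputs.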
+TEST(AffineMapTest, isProjectedPermutation) {
+ MLIRContext ctx;
+ OpBuilder b(&ctx);
+
+ // 1. Empty map
+ AffineMap map1 = b.getEmptyAffineMap();
+ EXPECT_TRUE(map1.isProjectedPermutation());
+
+ // 2. Map with a symbol
+ AffineMap map2 = AffineMap::get(0, 1, &ctx);
+ EXPECT_FALSE(map2.isProjectedPermutation());
+
+ // 3. The result map is {0} and zero results are _allowed_.
+ auto zero = b.getAffineConstantExpr(0);
+ AffineMap map3 = AffineMap::get(1, 0, {zero}, &ctx);
+ EXPECT_TRUE(map3.isProjectedPermutation(/*allowZeroInResults=*/true));
+
+ // 4. The result map is {0} and zero results are _not allowed_.
+ AffineMap map4 = AffineMap::get(1, 0, {zero}, &ctx);
+ EXPECT_FALSE(map4.isProjectedPermutation(/*allowZeroInResults=*/false));
+
+ // 5. The number of results > inputs
+ AffineMap map5 = AffineMap::get(1, 0, {zero, zero}, &ctx);
+ EXPECT_FALSE(map5.isProjectedPermutation(/*allowZeroInResults=*/true));
+
+ // 6. A constant result that's not 0
+ auto one = b.getAffineConstantExpr(1);
+ AffineMap map6 = AffineMap::get(1, 0, {one}, &ctx);
+ EXPECT_FALSE(map6.isProjectedPermutation(/*allowZeroInResults=*/true));
+
+ // 7. Not a dim expression
+ auto d0 = b.getAffineDimExpr(0);
+ auto d1 = b.getAffineDimExpr(1);
+
+ auto sum = d0 + d1;
+ AffineMap map7 = AffineMap::get(2, 0, {sum}, &ctx);
+ EXPECT_FALSE(map7.isProjectedPermutation());
+
+ // 8. (d0, d1, d2, d3, d4, d5) -> (d5, d3, d0, d1, d2, d4)
+ auto d2 = b.getAffineDimExpr(2);
+ auto d3 = b.getAffineDimExpr(3);
+ auto d4 = b.getAffineDimExpr(4);
+ auto d5 = b.getAffineDimExpr(5);
+ AffineMap map8 = AffineMap::get(6, 0, {d5, d3, d0, d1, d2, d4}, &ctx);
+ EXPECT_TRUE(map8.isProjectedPermutation());
+
+ // 9. (d0, d1, d2, d3, d4, d5) -> (d5, d3, d0 + d1, d2, d4)
+ AffineMap map9 = AffineMap::get(6, 0, {d5, d3, sum, d2, d4}, &ctx);
+ EXPECT_FALSE(map9.isProjectedPermutation());
+
+ // 10. (d0, d1, d2, d3, d4, d5) -> (d5, d3, d2, d4)
+ AffineMap map10 = AffineMap::get(6, 0, {d5, d3, d2, d4}, &ctx);
+ EXPECT_TRUE(map10.isProjectedPermutation());
+}
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 6aa34a54..91e7e01 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1020,6 +1020,7 @@ cc_library(
":basic",
":basic_builtins_gen",
":lex",
+ ":support",
":type_nodes_gen",
"//llvm:BinaryFormat",
"//llvm:Core",
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index c9a338a..f6648c9 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -529,7 +529,7 @@ libc_support_library(
":__support_macros_properties_os",
":func_aligned_alloc",
":func_free",
- ":func_malloc"
+ ":func_malloc",
],
)
@@ -576,6 +576,9 @@ libc_support_library(
":__support_common",
":__support_cpp_string_view",
":__support_integer_to_string",
+ ":func_free",
+ ":func_malloc",
+ ":func_realloc",
":string_memory_utils",
":string_utils",
],
@@ -657,6 +660,9 @@ libc_support_library(
hdrs = ["src/__support/char_vector.h"],
deps = [
":__support_common",
+ ":func_free",
+ ":func_malloc",
+ ":func_realloc",
],
)
@@ -861,13 +867,13 @@ libc_support_library(
":__support_error_or",
":__support_threads_mutex",
":errno",
- ":hdr_stdio_macros",
- ":hdr_stdio_overlay",
- ":types_off_t",
":func_aligned_alloc",
":func_free",
":func_malloc",
":func_realloc",
+ ":hdr_stdio_macros",
+ ":hdr_stdio_overlay",
+ ":types_off_t",
],
)
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index e99d6fc..20447d59 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -23,6 +23,7 @@ licenses(["notice"])
exports_files([
"LICENSE.TXT",
"cmake/modules/llvm-driver-template.cpp.in",
+ "include/llvm/BinaryFormat/Dwarf.def",
"include/llvm/CodeGen/SDNodeProperties.td",
"include/llvm/CodeGen/ValueTypes.td",
"include/llvm/Frontend/Directive/DirectiveBase.td",